Skip to content

Update docker/build-push-action digest to 53b7df9 #3748

Update docker/build-push-action digest to 53b7df9

Update docker/build-push-action digest to 53b7df9 #3748

Workflow file for this run

name: CI

# Runs on manual dispatch, merge-queue groups, pushes to main, and pull
# requests. The push and pull_request ignore lists below carry the same
# patterns; edit them together.
on:
  workflow_dispatch:
  merge_group: {}
  push:
    branches:
      - main
    paths-ignore:
      # pyproject.toml is deliberately NOT listed here. Auto-release uses
      # a PR-based bump flow (auto-release.yml opens an auto/bump-vX.Y.Z
      # PR with auto-merge); the squash-merge that lands on main carries
      # only a pyproject.toml diff and MUST trigger CI so the
      # workflow_run listener in auto-release.yml can fire and
      # tag + publish. The old CI→release→CI loop concern is already
      # handled by the bot-author filter and the "tag exists?" check
      # inside auto-release.yml.

      # Documentation & prose
      - 'docs/**'
      - '!docs/operations/ci-topology.md'
      - '!docs/observability/**'
      - '*.md'
      - '!README.md'
      - 'LICENSE'
      - 'CONTRIBUTORS.md'

      # Runtime state (never committed)
      - '.sdd/**'

      # Non-Python packages & SDKs
      - 'sdk/typescript/**'
      - 'packages/vscode/**'
      - 'packages/cursor-plugin/**'
      - 'packaging/**'
      - 'Formula/**'

      # Deployment & infra configs
      - 'deploy/**'
      - 'docker/**'
      - 'docker-compose.yaml'
      - 'Dockerfile'
      - 'action.yml'
      - 'action/**'

      # CI tool configs (don't re-run tests for codecov/sonar tweaks)
      - 'codecov.yml'
      - 'sonar-project.properties'

      # GitHub meta (templates, labels, funding - not ci.yml)
      - '.github/ISSUE_TEMPLATE/**'
      - '.github/FUNDING.yml'
      - '.github/CODEOWNERS'
      - '.github/pull_request_template.md'
      - '.github/dependabot.yml'
      - '.github/labeler.yml'
      - '.github/release-drafter.yml'
      - '.github/copilot-instructions.md'
      - '.github/codeql/**'

      # Non-code project files
      - 'marketing/**'
      - 'benchmarks/**'
      - 'examples/**'
      - 'plans/**'
      - 'agents/**'
      - 'commands/**'
      - 'rules/**'
      - '.bernstein/**'
      - '.plugin/**'
      - 'scripts/gen_tickets_*.py'
      - 'scripts/gen_roadmap_*.py'
      - 'scripts/generate_benchmark_docs.py'
  pull_request:
    paths-ignore:
      # Documentation & prose
      - 'docs/**'
      - '!docs/operations/ci-topology.md'
      - '!docs/observability/**'
      - '*.md'
      - '!README.md'
      - 'LICENSE'
      - 'CONTRIBUTORS.md'

      # Runtime state (never committed)
      - '.sdd/**'

      # Non-Python packages & SDKs
      - 'sdk/typescript/**'
      - 'packages/vscode/**'
      - 'packages/cursor-plugin/**'
      - 'packaging/**'
      - 'Formula/**'

      # Deployment & infra configs
      - 'deploy/**'
      - 'docker/**'
      - 'docker-compose.yaml'
      - 'Dockerfile'
      - 'action.yml'
      - 'action/**'

      # CI tool configs
      - 'codecov.yml'
      - 'sonar-project.properties'

      # GitHub meta
      - '.github/ISSUE_TEMPLATE/**'
      - '.github/FUNDING.yml'
      - '.github/CODEOWNERS'
      - '.github/pull_request_template.md'
      - '.github/dependabot.yml'
      - '.github/labeler.yml'
      - '.github/release-drafter.yml'
      - '.github/copilot-instructions.md'
      - '.github/codeql/**'

      # Non-code project files
      - 'marketing/**'
      - 'benchmarks/**'
      - 'examples/**'
      - 'plans/**'
      - 'agents/**'
      - 'commands/**'
      - 'rules/**'
      - '.bernstein/**'
      - '.plugin/**'
      - 'scripts/gen_tickets_*.py'
      - 'scripts/gen_roadmap_*.py'
      - 'scripts/generate_benchmark_docs.py'
# Concurrency policy for heavy CI (see #1273).
#
# Group key, chosen by the conditional in `group:`:
#   - pull_request → `pr-<number>`: stable across pushes to the same PR,
#     so a new commit cancels the older run and reviewers only ever wait
#     on the latest push instead of burning minutes on stale SHAs.
#   - every other event → `branch-<ref>`: a rapid wave of merges on main
#     lets the latest push supersede the stale heavy runs.
#
# `cancel-in-progress` is unconditionally true. Per-sha groups used to
# keep one full-matrix run alive per commit and saturated the runner
# queue (13 commits in 90 min during the May 2026 META wave).
# Per-SHA observability for main lives in main-sha-marker.yml, which is
# keyed by github.sha and is not cancellable by newer main pushes.
concurrency:
  cancel-in-progress: true
  group: "ci-${{ github.workflow }}-${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || format('branch-{0}', github.ref) }}"
# Workflow-level token is read-only (default-deny). A job that needs
# more declares the extra scopes itself
# (Scorecard token-permissions, Sonar S8264).
permissions:
  contents: read
jobs:
# ─── Planner (determines which downstream jobs may legitimately skip) ──
#
# Inspired by pypa/pip's CI: a planner job classifies the PR (or push)
# diff and emits boolean outputs. Downstream skips are then either
# "intentional" (planner said so) or suspicious (cancelled / crashed).
# The aggregator gate at the bottom uses these outputs to distinguish
# the two and refuses to pass on suspicious skips. See #1273.
determine-changes:
  name: Determine changes
  runs-on: ubuntu-latest
  timeout-minutes: 3
  permissions:
    contents: read
  outputs:
    python_changed: ${{ steps.classify.outputs.python_changed }}
    tests_changed: ${{ steps.classify.outputs.tests_changed }}
    gha_workflows_changed: ${{ steps.classify.outputs.gha_workflows_changed }}
    docs_only: ${{ steps.classify.outputs.docs_only }}
    # macos_sensitive: true when the diff touches platform-specific code
    # paths whose macOS behaviour cannot be exercised on ubuntu/windows
    # runners. Used by the `test` matrix gate (see #1468) to skip the
    # macos-latest cells on PRs that do not need them, freeing the
    # macOS hosted-runner pool during burst-merge waves. The nightly
    # workflow (ci-macos-nightly.yml) provides the safety-net coverage.
    macos_sensitive: ${{ steps.classify.outputs.macos_sensitive }}
  steps:
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false
        fetch-depth: 0
    - id: classify
      name: Classify changed paths
      env:
        # Every event-context value reaches the script through env, never
        # through inline `${{ }}` interpolation, so the shell only ever
        # sees it as data.
        BASE_REF: ${{ github.base_ref }}
        EVENT_NAME: ${{ github.event_name }}
        BEFORE_SHA: ${{ github.event.before }}
      run: |
        # On pull_request, diff against the base branch. On push to main
        # (or any other push event), diff against the previous commit so
        # the planner stays useful for filter-skipped downstream jobs.
        #
        # NOTE: checkout above uses fetch-depth: 0, so HEAD has full history.
        # However, origin/${BASE_REF} is NOT fetched by default in the PR
        # checkout (which is on the merge ref) - we must fetch it explicitly.
        # Using --depth=1 here previously caused `...` (merge-base) diffs to
        # fail when the merge-base wasn't in the shallow window, especially
        # after parallel main merges. Fetch the base ref WITHOUT --depth.
        CHANGED=""
        diff_failed=false
        if [ "$EVENT_NAME" = "pull_request" ]; then
          # Fetch base ref with full history (no --depth) so merge-base resolves.
          git fetch --no-tags origin "refs/heads/${BASE_REF}:refs/remotes/origin/${BASE_REF}" || true
          # If the local clone is still shallow for any reason, unshallow it.
          if [ -f "$(git rev-parse --git-dir)/shallow" ]; then
            git fetch --unshallow origin "${BASE_REF}" || git fetch origin "${BASE_REF}" || true
          fi
          # Diagnostic output - keeps future flakes debuggable in the action log.
          echo "::group::git diagnostic"
          git remote -v || true
          git log --oneline -5 || true
          echo "origin/${BASE_REF} -> $(git rev-parse "origin/${BASE_REF}" 2>&1 || echo UNRESOLVED)"
          echo "HEAD -> $(git rev-parse HEAD 2>&1 || echo UNRESOLVED)"
          echo "::endgroup::"
          if ! CHANGED=$(git diff --name-only "origin/${BASE_REF}...HEAD" 2>&1); then
            echo "::warning::git diff failed against origin/${BASE_REF}; falling back to safe over-broad result"
            echo "diff stderr: $CHANGED"
            diff_failed=true
          fi
        else
          # `before` may be 000... on first push of a branch (and is empty
          # for events that carry no `before` field); fall back to HEAD~1.
          BEFORE="${BEFORE_SHA:-}"
          if [ -z "$BEFORE" ] || [ "$BEFORE" = "0000000000000000000000000000000000000000" ]; then
            CHANGED=$(git diff --name-only "HEAD~1...HEAD" 2>/dev/null || git ls-files)
          else
            CHANGED=$(git diff --name-only "${BEFORE}...HEAD" 2>/dev/null || git ls-files)
          fi
        fi
        # Fail-safe fallback: when diff fails for any reason, emit the safe
        # over-broad classification so downstream jobs run anyway. Correctness
        # wins over efficiency - never fail this job for a clone-shape issue.
        if [ "$diff_failed" = "true" ]; then
          {
            echo "python_changed=true"
            echo "tests_changed=true"
            echo "gha_workflows_changed=true"
            echo "docs_only=false"
            echo "macos_sensitive=true"
          } | tee -a "$GITHUB_OUTPUT"
          exit 0
        fi
        echo "Changed files:"
        printf '%s\n' "$CHANGED" | sed 's/^/ /'
        # Pure-shell classification - auditable in `actionlint`, no
        # sub-shell variable round-tripping through python.
        python_changed=false
        tests_changed=false
        gha_workflows_changed=false
        docs_only=true
        macos_sensitive=false
        # Classify each changed path via grep. Using grep instead of
        # bash `case` to avoid linter warnings on overlapping patterns
        # (case-globs cannot cross slashes anyway).
        while IFS= read -r f; do
          [ -z "$f" ] && continue
          # matched_meta stays false only for paths that fit none of the
          # known categories; any such path disqualifies docs_only.
          matched_meta=false
          if printf '%s\n' "$f" | grep -Eq '^src/.*\.py$'; then
            python_changed=true; docs_only=false; matched_meta=true
          fi
          if printf '%s\n' "$f" | grep -Eq '^tests/'; then
            tests_changed=true; docs_only=false; matched_meta=true
          fi
          if printf '%s\n' "$f" | grep -Eq '^\.github/workflows/.*\.(yml|yaml)$'; then
            gha_workflows_changed=true; docs_only=false; matched_meta=true
          fi
          if printf '%s\n' "$f" | grep -Eq '^docs/|\.md$|^LICENSE$|^CONTRIBUTORS\.md$'; then
            matched_meta=true
          fi
          if [ "$matched_meta" = "false" ]; then
            docs_only=false
          fi
          # macOS-sensitive paths (see #1468). Modules with branches on
          # `sys.platform == "darwin"` or that wrap macOS-only APIs
          # (Keychain via `keyring`, AppKit notifications, Foundation
          # clipboard, `launchd` daemon installer). When any of these
          # change, the macOS matrix cell must run on the PR to catch
          # regressions before merge. Otherwise it skips and the
          # nightly ci-macos-nightly.yml workflow catches drift.
          if printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/tunnels/'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/daemon/'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/config/platform_compat\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/security/vault/'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/security/resource_limits\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/persistence/runtime_state\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/communication/notifications\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/core/preview/'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/tui/clipboard\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/cli/display/splash_screen\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^src/bernstein/bridges/openclaw_gateway\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^tests/integration/test_adapter_e2e\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^scripts/run_tests\.py$'; then
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^\.github/workflows/ci\.yml$'; then
            # Changes to the CI workflow itself must re-validate the
            # macOS cell on the PR so we never ship a broken matrix
            # config (the nightly workflow runs the *merged* config).
            macos_sensitive=true
          elif printf '%s\n' "$f" | grep -Eq '^\.github/workflows/ci-macos-nightly\.yml$'; then
            macos_sensitive=true
          fi
        done <<< "$CHANGED"
        # If nothing changed at all (e.g. workflow_dispatch on a clean ref),
        # treat as "not docs-only" so we don't intentionally skip tests.
        if [ -z "$CHANGED" ]; then
          docs_only=false
        fi
        echo "python_changed=$python_changed" | tee -a "$GITHUB_OUTPUT"
        echo "tests_changed=$tests_changed" | tee -a "$GITHUB_OUTPUT"
        echo "gha_workflows_changed=$gha_workflows_changed" | tee -a "$GITHUB_OUTPUT"
        echo "docs_only=$docs_only" | tee -a "$GITHUB_OUTPUT"
        echo "macos_sensitive=$macos_sensitive" | tee -a "$GITHUB_OUTPUT"
# ─── Fast checks (never cancelled, <2 min each) ───────────────────────
repo-hygiene:
  name: Repo hygiene
  runs-on: ubuntu-latest
  timeout-minutes: 5
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false
        # Full clone so ``bernstein agents-md verify`` can resolve the
        # default branch via ``git rev-parse --verify origin/main``.
        # The default depth=1 single-ref fetch leaves origin/main
        # unfetched and the verify step silently drops the
        # git-workflow section, producing drift against locally-synced
        # content.
        fetch-depth: 0
    - name: Establish origin/HEAD
      # actions/checkout does not set ``refs/remotes/origin/HEAD`` even
      # with full fetch. The generator's first-choice resolver reads
      # that symbolic-ref; this step makes the result match a
      # developer checkout where ``git remote set-head origin -a``
      # has run.
      run: git remote set-head origin -a
    - name: Assert .sdd is not tracked
      run: |
        TRACKED="$(git ls-files '.sdd')"
        if [ -n "$TRACKED" ]; then
          echo "::error::.sdd must never be committed to git"
          printf '%s\n' "$TRACKED"
          exit 1
        fi
    - name: Check for merge conflict markers in source files
      # Process substitution below is a bashism, so pin the shell.
      shell: bash
      run: |
        # Iterate NUL-delimited paths: `-z` stops git from C-quoting
        # unusual names and `read -d ''` keeps paths containing spaces or
        # glob characters intact, so every tracked file is really scanned.
        CONFLICTS=""
        while IFS= read -r -d '' f; do
          # Only `<<<<<<< ` / `>>>>>>> ` are matched; a bare `=======`
          # line is not treated as a marker.
          if grep -qE '^(<{7} |>{7} )' "$f" 2>/dev/null; then
            CONFLICTS="$CONFLICTS $f"
            echo "::error file=$f::Unresolved merge conflict markers in $f"
          fi
        done < <(git ls-files -z -- '*.py' '*.yaml' '*.yml' '*.md' '*.toml')
        if [ -n "$CONFLICTS" ]; then
          exit 1
        fi
    - name: Check Python syntax in scripts/
      run: |
        # Byte-compile each script; collect all failures before exiting so
        # one run reports every broken file.
        ERRORS=""
        for f in scripts/*.py; do
          if ! python3 -m py_compile "$f" 2>/dev/null; then
            ERRORS="$ERRORS $f"
            echo "::error file=$f::Syntax error in $f"
          fi
        done
        if [ -n "$ERRORS" ]; then
          exit 1
        fi
    - uses: ./.github/actions/bootstrap
    - name: AGENTS.md cross-CLI sync drift check
      # Fails if AGENTS.md / CLAUDE.md / CONVENTIONS.md / .aider.conf.yml /
      # .goosehints / .cursor/rules/*.mdc drift from `bernstein agents-md
      # generate`. Run `uv run bernstein agents-md sync` locally to fix.
      run: uv run bernstein agents-md verify --workdir .
# Static checks over src/: ruff lint, ruff format check, import-linter
# architecture contracts, and the route broad-except policy script.
lint:
  name: Lint
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 10
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: ./.github/actions/bootstrap

    - run: uv run ruff check src/

    - run: uv run ruff format --check src/

    - name: Architecture contracts (import-linter)
      run: uv run lint-imports

    # Policy from #1723: a bare `except Exception:` in
    # src/bernstein/core/routes/**.py fails this step unless a `bot-ack:`
    # or `intentional-broad-except` marker sits within 3 lines of it.
    - name: Route broad-except policy (#1723)
      run: uv run python scripts/check_routes_broad_except.py
# Spell-check the checkout with the pinned crate-ci/typos action.
spelling:
  name: Spelling (typos)
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 5
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: crate-ci/typos@bee27e3a4fd1ea2111cf90ab89cd076c870fce14 # v1
# Lints the workflow files themselves via reviewdog's actionlint wrapper.
actionlint:
  name: Workflow lint
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 5
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    # Input rationale:
    #   - `actionlint_flags: -shellcheck=` turns the embedded shellcheck
    #     off (an empty path means disabled). With it on, our workflows
    #     emit ~21 SC2016/SC2221/SC2222/SC2034 warnings that flood the
    #     22-annotation GitHub check_run cap; hitting the cap is itself
    #     reported as an error and fails reviewdog.
    #   - `level` is a severity tag, not a filter, so on its own it does
    #     not keep the cap from being hit.
    #   - `fail_level: error` replaces the deprecated `fail_on_error`.
    - uses: reviewdog/action-actionlint@6fb7acc99f4a1008869fa8a0f09cfca740837d9d # v1
      with:
        reporter: github-check
        level: error
        fail_level: error
        actionlint_flags: '-shellcheck='
# ADR-009 Lineage Gate - required check. The job builds a minimal signed
# lineage fixture itself and then verifies it, so the gate logic is
# always exercised even when the runtime `.sdd/lineage/log.jsonl` is
# absent.
lineage-gate:
  name: Lineage Gate
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 5
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: ./.github/actions/bootstrap

    - name: Run lineage gate
      run: |
        set -euo pipefail
        # Fixture lives under the runner temp dir; exported so the inline
        # Python below can read it from the environment.
        LINEAGE_FIXTURE="${RUNNER_TEMP}/lineage-fixture"
        export LINEAGE_FIXTURE
        # Part 1 - generate the fixture: one agent card, a one-entry log,
        # and a detached JWS signature stored under
        # signatures/<first two hex chars of path hash>/<path hash>/.
        uv run python - <<'PY'
        import hashlib
        import json
        import os
        from pathlib import Path
        from bernstein.core.lineage.entry import LineageEntry, canonicalise, entry_hash
        from bernstein.core.lineage.identity import generate_keypair, sign_detached
        root = Path(os.environ["LINEAGE_FIXTURE"])
        log = root / "lineage" / "log.jsonl"
        cards = root / "agents" / "agent:ci"
        log.parent.mkdir(parents=True, exist_ok=True)
        cards.mkdir(parents=True, exist_ok=True)
        private_key, public_key = generate_keypair()
        (cards / "card.json").write_text(
            json.dumps(
                {
                    "protocolVersion": "a2a/1.0",
                    "agent_id": "agent:ci",
                    "kid": "ci-fixture",
                    "public_key_pem": public_key,
                }
            ),
            encoding="utf-8",
        )
        entry = LineageEntry(
            v=1,
            artefact_path="ci/lineage-fixture.txt",
            artefact_kind="file",
            content_hash="sha256:" + ("1" * 64),
            parent_hashes=[],
            agent_id="agent:ci",
            agent_card_kid="ci-fixture",
            tool_call_id="ci-lineage-gate",
            span_id="ci-lineage-gate",
            ts_ns=1,
            operator_hmac="0" * 64,
        )
        canonical = canonicalise(entry)
        log.write_bytes(canonical + b"\n")
        jws = sign_detached(canonical, private_key, kid="ci-fixture")
        path_hash = hashlib.sha256(entry.artefact_path.encode()).hexdigest()
        entry_digest = entry_hash(entry).replace("sha256:", "")
        sig_dir = log.parent / "signatures" / path_hash[:2] / path_hash
        sig_dir.mkdir(parents=True, exist_ok=True)
        (sig_dir / f"{entry_digest}.jws").write_text(jws, encoding="utf-8")
        PY
        # Part 2 - run the gate script against the generated fixture.
        uv run python scripts/check_lineage.py \
          --log "${LINEAGE_FIXTURE}/lineage/log.jsonl" \
          --cards "${LINEAGE_FIXTURE}/agents"
# ─── Medium checks (cancel old runs, 5-20 min) ────────────────────────
typecheck:
  name: Type check report
  # Gated on lint so a lint failure fast-fails before pyright is run.
  needs:
    - lint
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 20
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: ./.github/actions/bootstrap

    # Advisory only: just the last line of pyright's output is printed
    # and `|| true` keeps the step from failing on that pipeline.
    - name: Run pyright (advisory)
      run: |
        uv run pyright 2>&1 | tail -1 || true
        echo "::notice::Typecheck is advisory while module decomposition shims are being typed"
# Vulture scan of src/ (plus the whitelist file) at 80% minimum
# confidence.
dead-code:
  name: Dead code (Vulture)
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 10
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: ./.github/actions/bootstrap

    - run: uv tool install vulture

    - run: vulture src/ vulture_whitelist.py --min-confidence 80 --exclude "tests,docs"
# Builds the distribution, enforces a 10 MB ceiling on each wheel, and
# publishes the wheel as an artifact for the install-smoke jobs.
dist-size:
  name: Package size check
  runs-on: ubuntu-latest
  permissions:
    contents: read
  timeout-minutes: 10
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false

    - uses: ./.github/actions/bootstrap

    - name: Build and check size
      run: |
        uv build
        limit_bytes=$((10 * 1024 * 1024))
        for wheel in dist/*.whl; do
          # GNU stat first, BSD stat as the fallback.
          wheel_bytes=$(stat -c%s "$wheel" 2>/dev/null || stat -f%z "$wheel")
          echo "$wheel: $wheel_bytes bytes"
          if [ "$wheel_bytes" -gt "$limit_bytes" ]; then
            echo "::error::Wheel $wheel exceeds 10MB limit ($wheel_bytes bytes)"
            exit 1
          fi
        done

    # Shared artifact consumed by install-smoke-pipx and
    # install-smoke-uv. Building once here and reusing the result avoids
    # a `uv build` in every cell of those matrices and guarantees every
    # smoke run installs a bit-identical wheel.
    - name: Upload built wheel for downstream install-smoke jobs
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
      with:
        name: install-smoke-wheel
        path: dist/*.whl
        retention-days: 1
        if-no-files-found: error
# ─── Install-path smoke (built wheel, not editable) ──────────────────
#
# The two install paths the README documents first - pipx and
# `uv tool install` - have no other coverage that exercises the
# *built* wheel end to end. Editable installs (`pip install -e .`)
# hide a class of packaging bugs: missing package-data, broken
# `console_scripts`, entry-point loading errors, dependency-resolver
# regressions. These jobs install the wheel produced by `dist-size`,
# then run `bernstein --version`, `bernstein --help`, and an
# importlib.resources probe of the packaged data to confirm the
# install path still works end to end.
install-smoke-pipx:
  name: 'Install smoke - pipx (${{ matrix.os }}, Python ${{ matrix.python-version }})'
  needs:
    - dist-size
  runs-on: ${{ matrix.os }}
  permissions:
    contents: read
  timeout-minutes: 15
  strategy:
    fail-fast: false
    matrix:
      os:
        - ubuntu-latest
        - macos-latest
      # Tracks `requires-python = ">=3.12"` in pyproject.toml. 3.11 is
      # left out on purpose: pipx / uv would refuse the wheel on a
      # Python the package does not support, which would only confirm
      # the floor we already pin.
      python-version:
        - "3.12"
        - "3.13"
  steps:
    - name: Harden runner (audit mode)
      if: runner.os == 'Linux'
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit

    - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
      with:
        python-version: ${{ matrix.python-version }}

    - name: Download built wheel
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
      with:
        name: install-smoke-wheel
        path: dist

    # Separate from the 10 MB gate in `dist-size`, which is tuned tight
    # to track day-to-day growth. This is the smoke job's own hard
    # ceiling: it catches accidental bundling regressions (binary
    # blobs, vendored deps) that would slow pipx installs for everyone.
    - name: Fail fast if wheel exceeds 25 MB
      shell: bash
      run: |
        set -euo pipefail
        ceiling_bytes=$((25 * 1024 * 1024))
        # nullglob: an unmatched pattern yields an empty array instead of
        # the literal pattern, so the emptiness check below is reliable.
        shopt -s nullglob
        found=(dist/*.whl)
        if [ "${#found[@]}" -eq 0 ]; then
          echo "::error::no wheel found under dist/"
          exit 1
        fi
        for wheel in "${found[@]}"; do
          wheel_bytes=$(stat -c%s "$wheel" 2>/dev/null || stat -f%z "$wheel")
          echo "$wheel: $wheel_bytes bytes"
          if [ "$wheel_bytes" -gt "$ceiling_bytes" ]; then
            echo "::error::wheel $wheel exceeds 25 MB install-smoke ceiling ($wheel_bytes bytes)"
            exit 1
          fi
        done

    # Scorecard pinned-dependencies: pip cannot be hash-pinned for a
    # single bootstrap step without a maintained requirements file, so
    # pipx is installed through SHA-pinned uv instead.
    # `uv tool install pipx` puts pipx on PATH via uv's tool dir.
    - name: Install uv (SHA-pinned, vendors pipx)
      uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
      with:
        enable-cache: true

    - name: Install pipx via uv
      shell: bash
      run: |
        set -euo pipefail
        uv tool install pipx
        # Later steps need both uv's tool shim dir and pipx's own bin
        # dir on PATH.
        UV_TOOL_BIN_DIR="$(uv tool dir --bin)"
        echo "$UV_TOOL_BIN_DIR" >> "$GITHUB_PATH"
        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
        "$UV_TOOL_BIN_DIR/pipx" ensurepath

    # Always the wheel, never editable: editable installs bypass
    # package-data inclusion and entry-point registration, which is
    # exactly the regression class this job exists to catch.
    - name: pipx install the built wheel
      shell: bash
      run: |
        set -euo pipefail
        wheels=(dist/*.whl)
        pipx install --python "$(which python)" "${wheels[0]}"

    - name: bernstein --version (exit zero)
      shell: bash
      run: bernstein --version

    - name: bernstein --help (exit zero)
      shell: bash
      run: bernstein --help

    # Proves package-data survived the wheel build by reading a bundled
    # MCP tool schema and a force-included default template through
    # importlib.resources, using the pipx-managed interpreter so the
    # site-packages layout matches what end users get.
    # `bernstein doctor --json` is deliberately not used: doctor is a
    # dev-environment diagnostic that probes optional tools (uv, ruff,
    # pytest, pyright) and git context, neither of which exists in a
    # fresh pipx venv. This probe stays scoped to the regression class
    # the smoke job is meant to catch.
    - name: Verify packaged resources load via importlib
      shell: bash
      run: |
        set -euo pipefail
        BERNSTEIN_PYTHON="$(pipx environment --value PIPX_LOCAL_VENVS)/bernstein/bin/python"
        "$BERNSTEIN_PYTHON" -c '
        import importlib
        import importlib.resources as ir
        cli_mod = importlib.import_module("bernstein.cli.main")
        assert callable(getattr(cli_mod, "cli")), "bernstein.cli.main:cli missing"
        schema_pkg = ir.files("bernstein.mcp.tool_schemas")
        schemas = [p.name for p in schema_pkg.iterdir() if p.name.endswith(".json")]
        assert schemas, "no MCP tool schemas shipped in wheel"
        tpl_pkg = ir.files("bernstein._default_templates")
        assert any(tpl_pkg.iterdir()), "no default templates shipped in wheel"
        print(f"packaged resources OK: {len(schemas)} MCP schemas")
        '
install-smoke-uv:
  # Leaner mirror of install-smoke-pipx for the `uv tool install`
  # path. uv is the second install command documented in the README
  # and rounds out coverage for the two paths most likely to surface
  # packaging regressions. We run a smaller matrix (one Python
  # version per OS) because the pipx job already exercises the
  # cross-Python combinatorics; uv shares the same wheel and
  # console-scripts entry point, so the marginal coverage of the full
  # OS x Python matrix is not worth the runner spend.
  name: Install smoke - uv tool (${{ matrix.os }})
  needs: [dist-size]
  runs-on: ${{ matrix.os }}
  timeout-minutes: 15
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix:
      os: [ubuntu-latest, macos-latest]
  steps:
    - name: Harden runner (audit mode)
      if: runner.os == 'Linux'
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
      with:
        python-version: "3.14"
    - name: Download built wheel
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
      with:
        name: install-smoke-wheel
        path: dist
    - name: Fail fast if wheel exceeds 25 MB
      # Same 25 MB ceiling as the pipx smoke job; also fails when the
      # downloaded artifact contains no wheel at all.
      shell: bash
      run: |
        set -euo pipefail
        MAX_SIZE=$((25 * 1024 * 1024))
        shopt -s nullglob
        wheels=(dist/*.whl)
        if [ "${#wheels[@]}" -eq 0 ]; then
          echo "::error::no wheel found under dist/"
          exit 1
        fi
        for f in "${wheels[@]}"; do
          SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
          echo "$f: $SIZE bytes"
          if [ "$SIZE" -gt "$MAX_SIZE" ]; then
            echo "::error::wheel $f exceeds 25 MB install-smoke ceiling ($SIZE bytes)"
            exit 1
          fi
        done
    - name: uv tool install the built wheel
      # `uv tool install` is the install command the README documents
      # second. As with pipx we install from the wheel, never
      # editable, so packaging bugs surface here and not in user
      # reports.
      shell: bash
      run: |
        set -euo pipefail
        wheels=(dist/*.whl)
        uv tool install "${wheels[0]}"
        # Ask uv where it placed the executable rather than assuming
        # ~/.local/bin, matching the pipx job; the conventional
        # ~/.local/bin location is still appended as a fallback.
        UV_TOOL_BIN_DIR="$(uv tool dir --bin)"
        echo "$UV_TOOL_BIN_DIR" >> "$GITHUB_PATH"
        echo "$HOME/.local/bin" >> "$GITHUB_PATH"
    - name: bernstein --version (exit zero)
      shell: bash
      run: bernstein --version
    - name: bernstein --help (exit zero)
      shell: bash
      run: bernstein --help
    - name: Verify packaged resources load via importlib
      # Mirror of the pipx job's resource probe but resolved through
      # the uv-managed tool venv. See pipx counterpart for rationale.
      shell: bash
      run: |
        set -euo pipefail
        UV_TOOL_DIR="$(uv tool dir)"
        BERNSTEIN_PYTHON="$UV_TOOL_DIR/bernstein/bin/python"
        if [ ! -x "$BERNSTEIN_PYTHON" ]; then
          echo "::error::cannot find uv-managed bernstein interpreter at $BERNSTEIN_PYTHON"
          exit 1
        fi
        "$BERNSTEIN_PYTHON" -c '
        import importlib
        import importlib.resources as ir
        cli_mod = importlib.import_module("bernstein.cli.main")
        assert callable(getattr(cli_mod, "cli")), "bernstein.cli.main:cli missing"
        schema_pkg = ir.files("bernstein.mcp.tool_schemas")
        schemas = [p.name for p in schema_pkg.iterdir() if p.name.endswith(".json")]
        assert schemas, "no MCP tool schemas shipped in wheel"
        tpl_pkg = ir.files("bernstein._default_templates")
        assert any(tpl_pkg.iterdir()), "no default templates shipped in wheel"
        print(f"packaged resources OK: {len(schemas)} MCP schemas")
        '
# ─── CI hardening 2026 (medium, parallel, ≤8 min each) ───────────────
property-tests:
  # Hypothesis-driven property suite. The `smoke` profile (50 examples
  # per property) keeps every file well under a minute at PR time while
  # still catching hash-chain, signature, and canonicalisation
  # regressions that fixed-input unit tests cannot reach.
  name: Property tests (Hypothesis smoke)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 10
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run Hypothesis property suite (smoke profile)
      run: |
        uv run pytest tests/property/ -q --no-cov --timeout=60
      env:
        # Profile name is read from the environment by the test suite.
        HYPOTHESIS_PROFILE: smoke
snapshot-tests:
  # Syrupy snapshot suite. Pins the JSONL field order and shape of the
  # audit log and lineage record, so wire-format drift is flagged here
  # before it can break downstream parsers.
  name: Snapshot tests (syrupy)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 5
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run snapshot tests
      run: |
        uv run pytest tests/snapshot/ -q --no-cov
schemathesis-smoke:
  # OpenAPI fuzzing of the FastAPI task server. The smoke profile
  # restricts itself to the critical-surface allow-list (task CRUD,
  # health, openapi.json, metrics), generates 5 examples per endpoint,
  # and applies the `not_a_server_error` check. Broader sweeps belong
  # to the nightly workflow.
  name: Schemathesis smoke
  runs-on: ubuntu-latest
  needs:
    - lint
  # The smoke profile takes roughly 7m30s of wall clock. The earlier
  # 8-minute limit was close enough that runs were cancelled, which in
  # turn failed the CI gate aggregator. 20 minutes leaves headroom; uv
  # setup is already cached by the bootstrap action and the in-process
  # ASGI schema build cannot be cached separately.
  timeout-minutes: 20
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run Schemathesis smoke profile
      run: |
        uv run pytest tests/contract/ -q --no-cov --timeout=30 -p no:warnings
      env:
        BERNSTEIN_AUTH_DISABLED: "1"
        SCHEMATHESIS_PROFILE: smoke
semgrep:
  # Runs the repository's own Semgrep rules from .semgrep.yml. Only
  # ERROR-severity findings fail the PR; WARNING findings are advisory
  # annotations.
  name: Semgrep (custom rules)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 5
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Install Semgrep (isolated env)
      # Semgrep requires click<8.2 and opentelemetry-sdk<1.38, while the
      # project requires click>=8.3.3 and opentelemetry-sdk>=1.41.1.
      # `uv tool` (the pipx equivalent) gives Semgrep a venv of its own
      # so those transitive pins never reach the project resolver.
      run: |
        uv tool install semgrep
    - name: Run Semgrep (ERROR-only fail gate)
      run: uv tool run semgrep --config .semgrep.yml --metrics off --severity ERROR --error src/
bandit:
  # Bandit static security analysis, gated at HIGH severity only. The
  # roughly 30 MEDIUM findings on main are accepted patterns (urlopen
  # with a timeout, hardcoded localhost in dev code). Existing HIGH
  # findings are recorded in `.bandit-baseline.json` and tracked as
  # follow-ups, so the PR fails only when it introduces a new HIGH.
  name: Bandit (security)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 5
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run Bandit (HIGH severity gate via baseline)
      # `.bandit-baseline.json` holds the 11 known HIGH findings on
      # main; passing it with `-b` means only HIGH issues that are new
      # relative to that baseline fail the step.
      run: |
        uv run bandit -r src/ --severity-level high -b .bandit-baseline.json
pip-audit:
  # CVE scan against the free PyPI advisory database (refreshed daily).
  # Production dependencies are a hard gate - any vulnerability fails
  # the job - while dev dependencies are report-only
  # (continue-on-error).
  name: pip-audit (deps)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 8
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Export production requirements (no project, hashed)
      # The audit runs against the resolved lockfile with bernstein
      # itself left out: `pip-audit` trips over editable distributions
      # even when `--skip-editable` is combined with `--strict`, so it
      # is handed a flat requirements file rendered from `uv.lock`.
      run: |
        uv export --no-dev --no-emit-project --format requirements-txt -o /tmp/req-prod.txt
    - name: Export dev requirements (no project, hashed)
      run: |
        uv export --no-emit-project --format requirements-txt -o /tmp/req-dev.txt
    - name: Production deps (strict)
      # `--no-deps` is safe here because the exported file is fully
      # pinned and `uv.lock` already resolved the transitive closure.
      # `--disable-pip` skips creating a sub-venv whose only purpose
      # would be to run `pip install`.
      #
      # `--ignore-vuln PYSEC-2025-183` (CVE-2025-45768) suppresses a
      # disputed pyjwt advisory. It is marked as affecting every
      # release (introduced at 0, no fixed version) and reaches us
      # transitively through `mcp`. The maintainer disputes it on the
      # grounds that key length is picked by the calling application,
      # not by the library. With no version to upgrade to, the advisory
      # is ignored and the reasoning is kept here.
      run: |
        uv run pip-audit -r /tmp/req-prod.txt --strict --disable-pip --no-deps --ignore-vuln PYSEC-2025-183
    - name: Dev deps (advisory)
      run: |
        uv run pip-audit -r /tmp/req-dev.txt --strict --disable-pip --no-deps --ignore-vuln PYSEC-2025-183
      continue-on-error: true
beartype:
  # Runtime enforcement of type contracts through beartype.claw: the
  # selected unit tests run with the claw import hook enabled, so a
  # violated annotation in a covered module shows up as a test failure.
  # The intended scope is the public APIs of core.security /
  # core.agents / core.protocols.cluster; the modules actually covered
  # are governed by the allow-list described on the step below.
  name: Beartype (type contracts)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 15
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run lineage-signer tests under beartype claw
      # tests/_beartype_claw.py owns the beartype-claw allow-list. At
      # present ``bernstein.core.persistence.lineage_signer`` is the
      # only module in the strict zone; extend the list as further
      # modules become beartype-clean.
      run: |
        uv run pytest tests/unit/ -q --no-cov --timeout=120 -k 'lineage_signer or lineage_record or lineage_export'
      env:
        BEARTYPE_USE_CLAW: enable
mutmut-diff:
  # Mutation-testing report restricted to the Python files under src/
  # that this PR changes. Advisory: the mutmut step is
  # continue-on-error, so the score is reported but never fails the
  # job.
  name: Mutation report (diff-only)
  needs: [lint]
  runs-on: ubuntu-latest
  timeout-minutes: 20
  if: github.event_name == 'pull_request'
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
        # Full history so the three-dot diff against the base branch
        # can find the merge base.
        fetch-depth: 0
    - uses: ./.github/actions/bootstrap
    - name: Compute changed src/ files
      id: diff
      env:
        BASE_REF: ${{ github.base_ref }}
      run: |
        # --diff-filter=d drops files the PR deletes: they no longer
        # exist in the checkout, so they cannot be mutated and must not
        # defeat the "nothing changed" skip below.
        # `grep ... || true` keeps a no-match result (exit 1) from
        # failing the pipeline should this ever run under pipefail.
        CHANGED=$(git diff --name-only --diff-filter=d \
          "origin/${BASE_REF}...HEAD" \
          | { grep '^src/.*\.py$' || true; } | tr '\n' ',' | sed 's/,$//')
        if [ -z "$CHANGED" ]; then
          echo "no Python files changed; skipping mutation step"
          echo "skip=true" >> "$GITHUB_OUTPUT"
          exit 0
        fi
        echo "paths=$CHANGED" >> "$GITHUB_OUTPUT"
        echo "Mutating: $CHANGED"
    - name: Run mutmut on changed files
      if: steps.diff.outputs.skip != 'true'
      continue-on-error: true  # advisory - score reported, not enforced
      env:
        # Comma-separated list produced by the `diff` step.
        DIFF_PATHS: ${{ steps.diff.outputs.paths }}
      run: |
        uv sync --group dev
        # mutmut 3.x takes paths_to_mutate from pyproject.toml /
        # mutmut_config.py rather than from the CLI, so the config is
        # rewritten to the diff paths for this run and restored after.
        cp mutmut_config.py mutmut_config.py.bak
        # One quoted, comma-terminated list entry per changed path.
        PATHS=$(echo "${DIFF_PATHS}" | tr ',' '\n' | sed 's/.*/ "&",/')
        {
          echo "paths_to_mutate = ["
          echo "$PATHS"
          echo "]"
          echo "test_command = \"python -m pytest tests/unit/ -x -q --no-header --override-ini=addopts=\""
          echo "tests_dir = \"tests/unit/\""
        } > mutmut_config.py
        # `|| true` guarantees the restore and the results summary run
        # even when mutmut itself exits non-zero.
        uv run mutmut run || true
        mv mutmut_config.py.bak mutmut_config.py
        uv run mutmut results || true
diff-coverage:
  # LEVEL 1 of the coverage ratchet - a diff-cover report that compares
  # the lines touched in this PR with the committed diff-coverage
  # floor. The floor is stored in .coverage-baseline.json (key:
  # diff_coverage_floor_percent) as the single source of truth that the
  # weekly bump workflow raises over time (see
  # docs/operations/coverage-ratchet.md). Consumes the coverage.xml
  # artifact named `coverage-report`.
  #
  # ADVISORY: the diff-cover step is continue-on-error, so this job's
  # result stays `success` even when diff coverage is below the floor.
  # The report is kept out of the CI-gate `needs` set until PR coverage
  # artifacts are reliable enough for a blocking threshold.
  name: Diff coverage report
  needs: [test]
  runs-on: ubuntu-latest
  timeout-minutes: 5
  if: github.event_name == 'pull_request'
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
        # Full history so diff-cover can diff against the base branch.
        fetch-depth: 0
    - uses: ./.github/actions/bootstrap
    - name: Download coverage report
      # NOTE(review): the only producer of `coverage-report` visible in
      # this workflow is the push-to-main `coverage-report` job, while
      # this job runs on pull_request only - confirm a PR-time producer
      # exists, otherwise the diff-cover step always takes its
      # "no coverage.xml" branch.
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
      with:
        name: coverage-report
      continue-on-error: true  # main may not have generated coverage on the PR's commit
    - name: Resolve diff-coverage floor from baseline
      id: floor
      # Single source of truth: the floor the weekly bump nudges up.
      # Falls back to 80 when the baseline file is absent so a missing
      # file never silently disables the threshold.
      run: |
        if [ -f .coverage-baseline.json ]; then
          floor=$(uv run python scripts/coverage_ratchet.py show-floor \
            --baseline .coverage-baseline.json)
        else
          echo "::warning::.coverage-baseline.json missing; defaulting diff floor to 80"
          floor=80
        fi
        echo "value=${floor}" >> "$GITHUB_OUTPUT"
        echo "Diff-coverage floor: ${floor}%"
    - name: Run diff-cover
      continue-on-error: true  # advisory until all PRs reliably have coverage.xml
      env:
        BASE_REF: ${{ github.base_ref }}
        FLOOR: ${{ steps.floor.outputs.value }}
      run: |
        if [ -f coverage.xml ]; then
          # Capture the exit status instead of letting `bash -e` abort
          # here: a --fail-under miss is exactly when the markdown
          # report is most useful, so it must still reach the step
          # summary before the status is re-raised.
          status=0
          uv run diff-cover coverage.xml \
            --compare-branch="origin/${BASE_REF}" \
            --fail-under="${FLOOR}" \
            --markdown-report diff-coverage.md || status=$?
          if [ -f diff-coverage.md ]; then
            cat diff-coverage.md >> "$GITHUB_STEP_SUMMARY" || true
          fi
          exit "$status"
        else
          echo "::warning::No coverage.xml found - skipping diff-cover"
        fi
pyright-strict-zone:
  # Pyright in strict mode over the security and protocols.cluster
  # subtrees. The repo-wide pyright run remains advisory (basic mode
  # from tool.pyright), whereas this strict zone fails the PR on any
  # new error inside the listed packages.
  name: Pyright strict (security + cluster)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 10
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
    - name: Run pyright strict on the curated allow-list
      # pyrightconfig.strict.json defines the strict-zone allow-list;
      # the repo-wide run is configured separately in pyproject.toml
      # under [tool.pyright]. Append files to the allow-list once they
      # are strict-clean.
      run: |
        uv run pyright --project pyrightconfig.strict.json
# ─── Slow checks (cancel old runs, 15-45 min) ─────────────────────────
adapter-integration:
  # End-to-end adapter tests driven by the fake-CLI harness under
  # tests/integration/fake_cli. Real subprocesses are spawned (no Popen
  # mocks), so PATH resolution, env filtering, exit-code mapping, and
  # output capture all go through an actual fork/exec. Not run on
  # Windows: the wrappers are POSIX shell scripts and the adapters pass
  # ``start_new_session=True``; the same argv/env logic is covered
  # there by unit tests with a mocked Popen.
  #
  # The macOS half lives in the adapter-integration-macos job, which is
  # gated by determine-changes.outputs.macos_sensitive, the
  # `macos-needed` label, or push events, to take pressure off the
  # hosted macOS runner pool during burst-merge waves - see #1468.
  # ci-macos-nightly.yml runs the full macOS matrix daily as the safety
  # net for regressions the path gate lets through.
  name: Adapter integration (fake-CLI)
  runs-on: ubuntu-latest
  needs:
    - lint
  timeout-minutes: 15
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.13"
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
      with:
        python-version: ${{ matrix.python-version }}
    - name: Run fake-CLI adapter integration tests
      run: |
        uv run pytest tests/integration/test_adapter_e2e.py -x -q --timeout=60
adapter-integration-macos:
  # macOS counterpart of adapter-integration. It runs when any one of
  # three conditions holds (see #1468):
  #   1. the event is a push (merges to main included), so the release
  #      train gets a fresh macOS signal for every commit reaching
  #      main;
  #   2. the PR diff touches macOS-sensitive paths (the planner reports
  #      macos_sensitive=true);
  #   3. the PR carries the `macos-needed` label (operator opt-in for
  #      cross-platform work the path filter cannot detect).
  # In every other case the job is skipped on PRs and
  # ci-macos-nightly.yml acts as the safety net.
  name: Adapter integration (fake-CLI, macOS)
  runs-on: macos-latest
  needs:
    - lint
    - determine-changes
  if: >-
    github.event_name == 'push' ||
    needs.determine-changes.outputs.macos_sensitive == 'true' ||
    contains(github.event.pull_request.labels.*.name, 'macos-needed')
  timeout-minutes: 15
  permissions:
    contents: read
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.13"
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
      with:
        python-version: ${{ matrix.python-version }}
    - name: Run fake-CLI adapter integration tests
      run: |
        uv run pytest tests/integration/test_adapter_e2e.py -x -q --timeout=60
test:
  # Each (os, python) cell is split over 4 parallel runners. The unit
  # suite executes every one of its ~1.4k test files in a dedicated
  # subprocess (the OOM-avoidance model described with the coverage
  # handling), and per-file Python startup plus full-package import
  # costs a fixed ~2.7s. At 1.4k files / 4 local workers that meant a
  # single runner burned 25+ min on startup alone (the file count
  # crossed a threshold once discovery widened to rglob). Spreading the
  # file list over TEST_SHARD_COUNT runners - each invoking
  # `run_tests.py --shard i/N`, a deterministic disjoint slice - leaves
  # every runner with ~1/4 of the files. The matrix `.result` consumed
  # by ci-gate is `failure` when ANY shard cell fails, so every shard
  # stays required and none can silently drop out.
  name: Test (${{ matrix.os }}, Python ${{ matrix.python-version }}, shard ${{ matrix.shard }})
  runs-on: ${{ matrix.os }}
  needs:
    - lint  # fast-fail: don't waste 45min if lint fails
  # A shard takes ~1/4 of the former wall time. Push-time coverage is
  # gathered inside the regular ubuntu/3.13 shard runs and merged by
  # the coverage-report job, so no shard repeats the suite serially
  # just for coverage.
  timeout-minutes: 90
  permissions:
    contents: read
  env:
    # Per-file guard. Coverage runs on main pushes are slower than
    # local per-file runs, so the limit is generous enough for the
    # heavy adapter contract files to complete.
    BERNSTEIN_TEST_FILE_TIMEOUT_SECONDS: "600"
    # Single source of truth for the shard count: both the `--shard
    # i/N` denominator and the per-shard slice derive from it. Change
    # it together with the `shard:` list to rescale the fan-out.
    TEST_SHARD_COUNT: "4"
  strategy:
    fail-fast: false
    matrix:
      # macOS is absent from the default matrix to ease hosted-runner
      # saturation during burst-merge waves (see #1468). The test-macos
      # job runs the same suite on macOS when the diff is
      # macOS-sensitive, the PR has the `macos-needed` label, or the
      # event is a push; ci-macos-nightly.yml adds a daily run of the
      # full macOS matrix as the safety net.
      os:
        - ubuntu-latest
        - windows-latest
      python-version:
        - "3.12"
        - "3.13"
      # Four deterministic shards per cell for the per-file suite.
      # Must stay in sync with TEST_SHARD_COUNT.
      shard:
        - 1
        - 2
        - 3
        - 4
      exclude:
        # Coverage/JUnit upload happens on ubuntu only, so the slow
        # Windows 3.12 duplicate is dropped.
        - os: windows-latest
          python-version: "3.12"
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
      with:
        python-version: ${{ matrix.python-version }}
    - name: Fetch base ref for impacted-test selection
      if: github.event_name == 'pull_request' && runner.os != 'Windows'
      run: |
        git fetch --no-tags --depth=1 origin "refs/heads/${BASE_REF}:refs/remotes/origin/${BASE_REF}"
      env:
        BASE_REF: ${{ github.base_ref }}
    - name: Run isolated test suite (Linux/macOS)
      if: runner.os != 'Windows'
      env:
        BASE_REF: ${{ github.base_ref }}
        EVENT_NAME: ${{ github.event_name }}
        PYTHON_VERSION: ${{ matrix.python-version }}
        SHARD: ${{ matrix.shard }}
        SHARD_COUNT: ${{ env.TEST_SHARD_COUNT }}
      run: |
        # Assemble the run_tests.py arguments once, in a fixed order:
        # --parallel, optional --coverage, --shard, optional --affected.
        runner_args=(--parallel 4)
        # Coverage is collected only on push, on Linux, for Python 3.13.
        if [ "${EVENT_NAME}" = "push" ] && [ "${RUNNER_OS}" = "Linux" ] && [ "${PYTHON_VERSION}" = "3.13" ]; then
          runner_args+=(--coverage)
        fi
        # `--shard i/N` picks a deterministic disjoint slice of the
        # discovered file list.
        runner_args+=(--shard "${SHARD}/${SHARD_COUNT}")
        # On PRs with the base ref available, the list is narrowed to
        # the affected files first, so each runner gets ~1/N of the
        # impacted set.
        if [ "${EVENT_NAME}" = "pull_request" ] && \
           git rev-parse --verify "refs/remotes/origin/${BASE_REF}" >/dev/null 2>&1; then
          runner_args+=(--affected "refs/remotes/origin/${BASE_REF}")
        fi
        uv run python scripts/run_tests.py "${runner_args[@]}"
    - name: Run isolated test suite (Windows)
      if: runner.os == 'Windows'
      continue-on-error: true  # Windows has Unix-only tests (chmod, SIGKILL) that are skipped but some may remain
      shell: pwsh
      env:
        SHARD: ${{ matrix.shard }}
        SHARD_COUNT: ${{ env.TEST_SHARD_COUNT }}
      run: |
        uv run python scripts/run_tests.py -x --parallel 4 --shard "${env:SHARD}/${env:SHARD_COUNT}"
    - name: Run capability-matrix spawn-refusal integration tests (Linux/macOS)
      # Restricted to shard 1: the unit suite is sharded but this
      # integration probe is not, so it should run once per (os,
      # python) cell instead of once per shard. It provides the
      # lethal-trifecta integration coverage - each supported OS/Python
      # cell drives the AgentSpawner spawn-refusal path so a regression
      # in the structural rule is caught before release. See
      # tests/integration/test_capability_matrix_spawn_refusal.py.
      if: runner.os != 'Windows' && matrix.shard == 1
      run: |
        uv run pytest tests/integration/test_capability_matrix_spawn_refusal.py -x -q --tb=short --timeout=120
    - name: Run capability-matrix spawn-refusal integration tests (Windows)
      # Restricted to shard 1 like the Linux/macOS step: one run per
      # cell.
      if: runner.os == 'Windows' && matrix.shard == 1
      shell: pwsh
      run: |
        uv run pytest tests/integration/test_capability_matrix_spawn_refusal.py -x -q --tb=short --timeout=120
    - name: Prepare coverage shard artifact
      if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' && github.event_name == 'push'
      run: |
        # The coverage-collecting run must have left a .coverage file;
        # rename it per shard so the merged download cannot collide.
        if [ ! -f .coverage ]; then
          echo "::error::.coverage was not generated for shard ${{ matrix.shard }}"
          exit 1
        fi
        mv .coverage ".coverage.${{ matrix.shard }}"
    - name: Upload coverage shard artifact
      if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.13' && github.event_name == 'push'
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
      with:
        name: coverage-data-${{ matrix.shard }}
        path: .coverage.${{ matrix.shard }}
        # The data file is a dotfile, so hidden files must be included.
        include-hidden-files: true
        if-no-files-found: error
        retention-days: 1
coverage-report:
  # Combines the per-shard coverage data files uploaded by the `test`
  # job into a single coverage.xml, publishes it as the
  # `coverage-report` artifact, and sends it to Codecov. Runs only for
  # pushes to main.
  name: Coverage report
  needs: [test]
  if: github.event_name == 'push' && github.ref == 'refs/heads/main'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
      with:
        python-version: "3.13"
    - name: Download coverage shard artifacts
      uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c  # v8.0.1
      with:
        pattern: coverage-data-*
        path: coverage-shards
        # Flatten every matching artifact into the one directory.
        merge-multiple: true
    - name: Merge coverage shards
      env:
        # Number of shard data files that must be present. Keep equal
        # to TEST_SHARD_COUNT (and the `shard:` matrix list) in the
        # `test` job - that job-level env is not visible from here.
        EXPECTED_SHARD_COUNT: "4"
      run: |
        shard_count=$(find coverage-shards -name '.coverage.*' -type f | wc -l | tr -d ' ')
        if [ "$shard_count" -lt "$EXPECTED_SHARD_COUNT" ]; then
          echo "::error::expected ${EXPECTED_SHARD_COUNT} coverage shard files, found $shard_count"
          # List what did arrive to make the failure diagnosable.
          find coverage-shards -maxdepth 2 -type f -print
          exit 1
        fi
        uv run python -m coverage combine coverage-shards/.coverage.*
        uv run python -m coverage xml --ignore-errors -o coverage.xml
        # Fail when the XML report is missing or empty.
        test -s coverage.xml
    - name: Upload coverage report artifact
      uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
      with:
        name: coverage-report
        path: coverage.xml
        if-no-files-found: error
        retention-days: 1
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@fb8b3582c8e4def4969c97caa2f19720cb33a72f  # v7.0.0
      with:
        files: coverage.xml
        # A Codecov upload problem must not fail CI.
        fail_ci_if_error: false
        token: ${{ secrets.CODECOV_TOKEN }}
test-macos:
  # macOS half of the test matrix, split out for #1468 (hosted macOS
  # runner saturation). It always runs on push, so each commit reaching
  # main gets a fresh macOS signal. On PRs it runs only when a gate
  # condition holds:
  #   - the planner marked the diff macos_sensitive (a module with
  #     `sys.platform == "darwin"` branches, the tunnels driver layer,
  #     the daemon installer, the runtime state code, the macOS
  #     clipboard/notifications wrappers, or ci.yml /
  #     ci-macos-nightly.yml themselves were touched);
  #   - the PR has the `macos-needed` label (operator opt-in for
  #     cross-platform work the path filter cannot detect).
  # Otherwise the job is skipped on the PR and ci-macos-nightly.yml
  # supplies the safety-net coverage at 06:00 UTC each day.
  #
  # Coverage / JUnit / Codecov uploads are NOT mirrored here; they stay
  # ubuntu-only, matching the previous matrix gating.
  #
  # Why macOS is not sharded: the ubuntu `test` job fans out over a
  # `shard` matrix dimension, but this job's `name:` MUST remain the
  # literal string below. Branch protection's required context and
  # required-check-canary.yml both pin it, and a `shard` matrix
  # dimension would template the name and break that lock. Instead the
  # per-push workload is reduced to one deterministic quarter of the
  # file list (`--shard 1/MACOS_PUSH_SHARD_COUNT`). That fits the time
  # budget without hitting the 90-min wall the full macOS suite ran
  # into, keeps a real (deterministic, ~1/4) macOS signal on every
  # commit landing on main, and leaves the FULL macOS matrix to
  # ci-macos-nightly.yml. PR runs already use `--affected` (the
  # impacted slice), so macos_sensitive PRs still exercise exactly the
  # touched code on macOS. The merge queue (merge_group) skips this job
  # entirely (see the ci-gate MACOS_SKIP_EVENTS handling); the
  # post-merge push carries the macOS signal.
  #
  # Literal job name -- NOT templated. The branch-protection required
  # context for macOS test runs depends on this exact string. When a
  # job is skipped through its `if:` gate, GitHub posts a templated
  # name verbatim, which never matches a required-context rule; the
  # literal form stays resolvable in every state (success, fail, skip).
  # required-check-canary.yml asserts that it remains literal.
  name: Test (macos-latest, Python 3.13)
  runs-on: macos-latest
  needs:
    - lint
    - determine-changes
  if: >-
    github.event_name == 'push' ||
    needs.determine-changes.outputs.macos_sensitive == 'true' ||
    contains(github.event.pull_request.labels.*.name, 'macos-needed')
  timeout-minutes: 90
  permissions:
    contents: read
  env:
    # Denominator of the per-push slice: push runs execute only shard 1
    # of this many, a deterministic ~1/4 subset of the file list.
    # ci-macos-nightly.yml runs the full matrix daily as the safety
    # net.
    MACOS_PUSH_SHARD_COUNT: "4"
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.13"
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - uses: ./.github/actions/bootstrap
      with:
        python-version: ${{ matrix.python-version }}
    - name: Fetch base ref for impacted-test selection
      if: github.event_name == 'pull_request'
      run: |
        git fetch --no-tags --depth=1 origin "refs/heads/${BASE_REF}:refs/remotes/origin/${BASE_REF}"
      env:
        BASE_REF: ${{ github.base_ref }}
    - name: Run isolated test suite
      env:
        BASE_REF: ${{ github.base_ref }}
        EVENT_NAME: ${{ github.event_name }}
        MACOS_PUSH_SHARD_COUNT: ${{ env.MACOS_PUSH_SHARD_COUNT }}
      run: |
        # Choose the file selection first, then invoke the runner once.
        if [ "${EVENT_NAME}" = "pull_request" ] && \
           git rev-parse --verify "refs/remotes/origin/${BASE_REF}" >/dev/null 2>&1; then
          # PR with the base ref fetched: only the slice impacted by
          # the diff.
          selection=(--affected "refs/remotes/origin/${BASE_REF}")
        else
          # Any other case (e.g. push to main): one deterministic
          # quarter of the file list so the macOS cell fits its time
          # budget; ci-macos-nightly.yml runs the full matrix daily.
          selection=(--shard "1/${MACOS_PUSH_SHARD_COUNT}")
        fi
        uv run python scripts/run_tests.py --parallel 4 "${selection[@]}"
    - name: Run capability-matrix spawn-refusal integration tests
      run: |
        uv run pytest tests/integration/test_capability_matrix_spawn_refusal.py -x -q --tb=short --timeout=120
# ─── Post-pipeline (conditional, never cancelled) ──────────────────────
autofix:
  # Post-merge ruff auto-fix for main: applies `ruff check --fix` and
  # `ruff format` to src/, untracks any committed .sdd files, and
  # pushes the result as a bot commit. Skipped for the two bernstein
  # bot actors and when the last three commits are already auto-fix
  # commits. The whole job is continue-on-error, so a failure here
  # never fails the workflow.
  name: Auto-fix lint
  runs-on: ubuntu-latest
  needs: [lint, repo-hygiene]
  if: github.event_name == 'push' && github.ref == 'refs/heads/main' && github.actor != 'bernstein[bot]' && github.actor != 'bernstein-orchestrator[bot]'
  continue-on-error: true
  timeout-minutes: 15
  permissions:
    contents: write
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        # No token is left in the clone; the push step authenticates
        # explicitly instead.
        persist-credentials: false
        # Enough history for the 3-commit loop check below.
        fetch-depth: 5
    - name: Check for autofix loop
      id: loop_check
      run: |
        # Count auto-fix subjects among the last three commits;
        # `|| true` absorbs grep's exit 1 when the count is zero.
        RECENT=$(git log --oneline -3 --format='%s' | grep -c "style: auto-fix" || true)
        if [ "$RECENT" -ge 3 ]; then
          echo "::warning::Autofix loop detected - last 3 commits are all auto-fix. Skipping."
          echo "skip=true" >> "$GITHUB_OUTPUT"
          exit 0
        fi
    - if: steps.loop_check.outputs.skip != 'true'
      uses: ./.github/actions/bootstrap
    - name: Auto-fix ruff
      if: steps.loop_check.outputs.skip != 'true'
      run: |
        # Remaining unfixable lint errors must not stop the formatter.
        uv run ruff check src/ --fix --unsafe-fixes || true
        uv run ruff format src/
    - name: Purge tracked .sdd files
      if: steps.loop_check.outputs.skip != 'true'
      run: |
        # Untrack (but keep on disk) anything under .sdd that was
        # committed by mistake.
        TRACKED="$(git ls-files '.sdd')"
        if [ -n "$TRACKED" ]; then
          echo "$TRACKED" | xargs git rm --cached --
        fi
    - name: Commit and push fixes
      if: steps.loop_check.outputs.skip != 'true'
      env:
        # checkout ran with persist-credentials: false, so a bare
        # `git push` has no credentials and fails (silently, because
        # the job is continue-on-error). Hand the token to this step
        # only.
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      run: |
        git add src/ tests/ scripts/
        if git diff --cached --quiet; then
          echo "Nothing to fix"
          exit 0
        fi
        git config user.name "bernstein[bot]"
        git config user.email "bernstein-bot@users.noreply.github.com"
        git commit -m "style: auto-fix ruff lint and format"
        # Push to the triggering ref through an authenticated URL; the
        # job-level `if:` guarantees GITHUB_REF is refs/heads/main.
        git push "https://x-access-token:${GH_TOKEN}@${GITHUB_SERVER_URL#https://}/${GITHUB_REPOSITORY}.git" "HEAD:${GITHUB_REF}"
close-ci-issues:
  # After a green CI gate on a push to main, closes every open issue
  # labelled `ci-fix`, leaving a comment with the gate result, the run
  # URL, and the commit SHA.
  name: Close resolved CI issues
  runs-on: ubuntu-latest
  needs: [ci-gate]
  if: >-
    success() &&
    needs.ci-gate.result == 'success' &&
    github.ref == 'refs/heads/main' &&
    github.event_name == 'push'
  timeout-minutes: 5
  permissions:
    contents: read
    issues: write
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411  # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0  # v7
      with:
        persist-credentials: false
    - name: Close ci-fix issues
      env:
        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        # Context values reach the script through the environment, not
        # through `${{ }}` text substitution into the shell source,
        # matching the env indirection used by the other jobs.
        GATE_RESULT: ${{ needs.ci-gate.result }}
      run: |
        # GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID and
        # GITHUB_SHA are default variables set by the runner.
        RUN_URL="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
        # Best effort: a failed listing yields an empty list and the
        # loop below does nothing.
        ISSUES=$(gh issue list --label ci-fix --state open --json number --jq '.[].number' 2>/dev/null || echo "")
        for NUM in $ISSUES; do
          # A failure to close one issue must not stop the rest.
          gh issue close "$NUM" --comment "CI aggregate gate result: ${GATE_RESULT}. Run: ${RUN_URL}. Commit: \`${GITHUB_SHA}\`." || true
        done
# Note: the previous `self-heal` issue-creating job was superseded by the
# `bernstein-ci-fix` workflow (.github/workflows/bernstein-ci-fix.yml),
# which on a CI failure first attempts an auto-heal PR and only falls
# back to opening a `ci-fix` issue when no diff is produced.
pr-summary:
  # Cosmetic job: posts (or updates) a single PR comment that tabulates the
  # result of each upstream check. Identified by a hidden HTML marker so the
  # same comment is rewritten on every run.
  name: PR CI summary
  runs-on: ubuntu-latest
  # A job with `needs:` gets an implicit `success()` check unless its `if:`
  # contains a status-check function, so without `always()` this job would
  # be skipped whenever any listed job failed or was skipped - exactly the
  # runs where the summary matters. `!cancelled()` still lets a cancelled
  # workflow run stop it.
  if: always() && !cancelled() && github.event_name == 'pull_request'
  needs:
    - repo-hygiene
    - lint
    - typecheck
    - test
    - test-macos
    - spelling
    - dead-code
    - actionlint
    - dist-size
    - property-tests
    - snapshot-tests
    - schemathesis-smoke
    - semgrep
    - bandit
    - pip-audit
    - beartype
    - pyright-strict-zone
  timeout-minutes: 5
  permissions:
    pull-requests: write
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit
    - uses: actions/github-script@3a2844b7e9c422d3c10d287c895573f7108da1b3 # v9.0.0
      with:
        script: |
          // Hidden marker used to find the summary comment on later runs.
          const marker = '<!-- ci-summary -->';
          const jobs = [
            { name: 'Repo hygiene', result: '${{ needs.repo-hygiene.result }}' },
            { name: 'Lint', result: '${{ needs.lint.result }}' },
            { name: 'Type check report', result: '${{ needs.typecheck.result }}' },
            { name: 'Tests', result: '${{ needs.test.result }}' },
            { name: 'Tests (macOS)', result: '${{ needs.test-macos.result }}' },
            { name: 'Spelling', result: '${{ needs.spelling.result }}' },
            { name: 'Dead code', result: '${{ needs.dead-code.result }}' },
            { name: 'Workflow lint', result: '${{ needs.actionlint.result }}' },
            { name: 'Dist size', result: '${{ needs.dist-size.result }}' },
            { name: 'Property (Hypothesis)', result: '${{ needs.property-tests.result }}' },
            { name: 'Snapshot (syrupy)', result: '${{ needs.snapshot-tests.result }}' },
            { name: 'Schemathesis smoke', result: '${{ needs.schemathesis-smoke.result }}' },
            { name: 'Semgrep custom', result: '${{ needs.semgrep.result }}' },
            { name: 'Bandit', result: '${{ needs.bandit.result }}' },
            { name: 'pip-audit', result: '${{ needs.pip-audit.result }}' },
            { name: 'Beartype', result: '${{ needs.beartype.result }}' },
            { name: 'Pyright strict zone', result: '${{ needs.pyright-strict-zone.result }}' },
          ];
          // Anything that is not success/failure/skipped (e.g. cancelled) gets a warning icon.
          const icon = (r) => r === 'success' ? '✅' : r === 'failure' ? '❌' : r === 'skipped' ? '⏭️' : '⚠️';
          let body = `${marker}\n### CI Summary\n\n`;
          body += '| Check | Result |\n|-------|--------|\n';
          for (const j of jobs) {
            body += `| ${j.name} | ${icon(j.result)} ${j.result} |\n`;
          }
          body += '\nCoverage and detailed reports are available via Codecov and the Checks tab.';
          // Walk every page of comments: a single un-paginated call only
          // returns the first page, so on a busy PR the marker comment
          // would be missed and a duplicate summary would be created.
          const comments = await github.paginate(github.rest.issues.listComments, {
            owner: context.repo.owner,
            repo: context.repo.repo,
            issue_number: context.issue.number,
            per_page: 100,
          });
          // `body` may be absent on a comment; guard before searching it.
          const existing = comments.find(c => c.body?.includes(marker));
          if (existing) {
            await github.rest.issues.updateComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: existing.id,
              body,
            });
          } else {
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: context.issue.number,
              body,
            });
          }
# ─── Aggregator gate (closes #1273) ────────────────────────────────────
#
# SINGLE required status check for branch protection.
#
# Why this exists:
# * `ci.yml` has 20+ jobs (incl. an OS x Python matrix that fans out
# `test (ubuntu-latest, 3.12)` etc.) - listing every contextual name
# in branch-protection is fragile because matrix names drift with
# each matrix change.
# * A `skipped` job auto-passes branch protection. So does a job that
# was never enqueued because an upstream `needs:` was `cancelled`.
# Either condition can let a red commit reach `main`.
#
# This job fails on ANY non-success result - including `cancelled`,
# `timed_out`, `action_required`, or a job that never ran. `skipped`
# passes ONLY when the planner job (`determine-changes`) classified
# the diff in a way that makes the skip intentional (docs-only PRs,
# event-gated jobs, etc.). A `skipped` result for a job that should
# have run is treated as a failure - pattern borrowed from pypa/pip's
# CI aggregator (see PR #1273 for the discussion).
#
# Operator: after this PR merges, replace every required check in
# branch protection with this single context (name shown in UI:
# `CI gate`). The exact `gh api` invocation is in PR #1273's body.
#
# Excluded from `needs:`:
# * `autofix` - runs only on push to main, mutates the tree
# * `close-ci-issues` - post-gate issue update, must not gate merges
# * `pr-summary` - cosmetic PR comment, must not gate merges
# * `typecheck` - advisory report while repo-wide pyright is being typed
# * `mutmut-diff` - advisory report until mutation score enforcement is enabled
# * `diff-coverage` - advisory report until PR coverage artifacts are reliable
# * `ci-gate` itself - would deadlock the dependency graph
ci-gate:
  # Aggregator job: inspects the result of every job listed in `needs:` and
  # fails unless each one is `success` or an intentionally tolerated `skipped`.
  name: CI gate
  runs-on: ubuntu-latest
  # `always()` overrides the implicit `success()` check so the gate still
  # runs when an upstream job failed or was skipped. `!cancelled()` lets
  # the gate itself be cancelled when the user cancels the whole workflow
  # run (so we don't try to report "cancelled" on a manually aborted run).
  if: always() && !cancelled()
  needs:
    - determine-changes
    - repo-hygiene
    - lint
    - spelling
    - actionlint
    - lineage-gate
    - dead-code
    - dist-size
    - install-smoke-pipx
    - install-smoke-uv
    - property-tests
    - snapshot-tests
    - schemathesis-smoke
    - semgrep
    - bandit
    - pip-audit
    - beartype
    - pyright-strict-zone
    - adapter-integration
    - adapter-integration-macos
    - test
    - coverage-report
    - test-macos
  timeout-minutes: 3
  permissions:
    contents: read
  steps:
    - name: Harden runner (audit mode)
      uses: step-security/harden-runner@9af89fc71515a100421586dfdb3dc9c984fbf411 # v2.19.4
      with:
        egress-policy: audit
    - id: roll-up
      name: Roll up needs.*.result with conditional allowed-skips
      env:
        NEEDS_JSON: ${{ toJSON(needs) }}
        PLAN_JSON: ${{ toJSON(needs.determine-changes.outputs) }}
        EVENT_NAME: ${{ github.event_name }}
      run: |
        # Write JSONs to disk so the python heredoc can read them
        # without worrying about shell quoting on multiline content.
        printf '%s' "$NEEDS_JSON" > results.json
        printf '%s' "$PLAN_JSON" > plan.json
        python3 - <<'PY'
        import json, os, sys


        def load_object(path, what):
            """Return the JSON object stored at `path`, or fail the gate.

            A missing, unparsable, or non-object payload ends the step
            with an `::error::` annotation instead of a bare traceback,
            so the reason the gate went red is visible in the run UI.
            """
            try:
                with open(path) as fh:
                    value = json.load(fh)
            except (OSError, ValueError) as exc:
                print(f"::error::CI gate FAILED - cannot read {what} ({path}): {exc}")
                sys.exit(1)
            if not isinstance(value, dict):
                print(f"::error::CI gate FAILED - {what} ({path}) is not a JSON object")
                sys.exit(1)
            return value


        data = load_object("results.json", "needs results")
        plan = load_object("plan.json", "determine-changes outputs")
        # Fail closed: with no upstream results the loop below would
        # find nothing to reject and the gate would pass vacuously.
        if not data:
            print("::error::CI gate FAILED - no upstream job results were reported")
            sys.exit(1)
        event = os.environ.get("EVENT_NAME", "")
        # Jobs that intentionally skip on docs-only changes (no python
        # / tests / workflows touched). Mirrors paths-ignore at the top.
        # NOTE(review): `coverage-report` and the macOS jobs are not in
        # this set; if they can be skipped on a docs-only push to main
        # the gate will reject that push - confirm this is intended.
        DOCS_ONLY_SKIPPABLE = {
            "test",
            "schemathesis-smoke",
            "snapshot-tests",
            "property-tests",
            "beartype",
            "adapter-integration",
            "pyright-strict-zone",
            "semgrep",
            "bandit",
            "pip-audit",
            "dead-code",
            "dist-size",
            "install-smoke-pipx",
            "install-smoke-uv",
        }
        # macOS-gated jobs (see #1468): on PRs these are skipped unless
        # the diff is macos_sensitive or the PR carries the
        # `macos-needed` label. Always run on push events.
        MACOS_GATED = {"test-macos", "adapter-integration-macos"}
        # Push-only jobs are required on native main pushes but skip
        # intentionally on PR, workflow_dispatch, and merge_group runs.
        PUSH_ONLY = {"coverage-report"}
        # Events under which a macOS-gated skip is intentional. PRs and
        # manual dispatch already gate macOS behind macos_sensitive /
        # label. merge_group must be included too: a merge queue runs CI
        # on a synthetic merge_group ref where `github.event.pull_request`
        # is null, so the `macos-needed` label and `push`-only branches of
        # the job `if:` can never be true and these jobs always skip. The
        # merged commit still triggers a native `push` to main that runs
        # the full macOS suite un-gated, and ci-macos-nightly.yml covers
        # regression -- so tolerating the skip here is what keeps the queue
        # from wedging without losing macOS coverage on what actually lands.
        MACOS_SKIP_EVENTS = ("pull_request", "workflow_dispatch", "merge_group")
        # Planner outputs arrive as strings, hence the comparison to "true".
        docs_only = plan.get("docs_only") == "true"
        macos_sensitive = plan.get("macos_sensitive") == "true"
        # Read the PR labels via the event payload. The aggregator
        # runs inside ci.yml so the same event.pull_request.labels
        # array used by the job `if:` is available here too. The
        # GITHUB_EVENT_PATH file is the canonical source.
        macos_labelled = False
        try:
            with open(os.environ["GITHUB_EVENT_PATH"]) as fh:
                payload = json.load(fh)
            labels = (payload.get("pull_request") or {}).get("labels") or []
            macos_labelled = any(
                (lbl.get("name") == "macos-needed") for lbl in labels
            )
        except Exception:
            # Deliberately best-effort: an unreadable payload is treated
            # the same as "label not present".
            macos_labelled = False
        # (job name, result) pairs that make the gate fail.
        bad = []
        for name, info in data.items():
            r = info.get("result")
            if r == "success":
                continue
            if r == "skipped":
                if name in PUSH_ONLY and event != "push":
                    continue
                if docs_only and name in DOCS_ONLY_SKIPPABLE:
                    continue
                # macOS-gated jobs are allowed to skip on:
                #   - PRs when diff is not macos_sensitive AND no
                #     `macos-needed` label.
                #   - workflow_dispatch (manual re-runs from hotfix
                #     agents) when diff is not macos_sensitive.
                #   - merge_group (merge-queue ref) where the label/push
                #     branches of the job `if:` cannot be true; the
                #     post-merge push to main runs macOS un-gated.
                # On native push events macOS must run.
                # Nightly ci-macos-nightly.yml covers regression.
                if (
                    name in MACOS_GATED
                    and event in MACOS_SKIP_EVENTS
                    and not macos_sensitive
                    and not macos_labelled
                ):
                    continue
            # Any other result (failure, cancelled, unexpected skip, ...)
            # is recorded as a gate failure.
            bad.append((name, r))
        if bad:
            print("::error::CI gate FAILED - these jobs were not success "
                  "and not intentionally skipped:")
            for n, r in bad:
                print(f"  - {n}: result={r}")
            sys.exit(1)
        print(f"CI gate: all required jobs passed "
              f"(or intentionally skipped). docs_only={docs_only}, event={event}")
        for n, info in sorted(data.items()):
            print(f"  {n}: {info.get('result')}")
        PY