E2E Nightly #37
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# E2E Nightly Workflow (Loop E2E-L10 of #91, issue #100).
#
# Runs the full 50-scenario adversarial corpus once per night and
# auto-PRs a markdown report. The diff vs the previous report is the
# primary value; the per-run roll-up is context.
#
# Today the upstream is the in-process FastAPI mock (matching the
# PR-gate workflow). When `LLMTRACE_E2E_REAL_UPSTREAM_URL` is wired up
# as a repo secret, the conftest will use it instead of the mock so
# the nightly exercises a real LLM. Cost cap (`--cost-cap-usd`) gates
# the session in either mode.
name: E2E Nightly

on:
  # Scheduled trigger: 03:00 UTC every day. That is well outside US/EU
  # business hours, so a flaky run doesn't stall a PR review.
  schedule:
    - cron: "0 3 * * *"
  # Manual trigger, with an optional override for the session cost cap.
  workflow_dispatch:
    inputs:
      cost_cap_usd:
        description: "Per-session cost cap in USD (default: 2.0)"
        required: false
        default: "2.0"

# All runs share one concurrency group and an in-flight run is never
# cancelled by a newer one. Parallel nightlies bring no benefit because
# the state we care about (the committed report) is shared.
concurrency:
  group: e2e-nightly
  cancel-in-progress: false

# Write access to contents and pull requests is what the auto-PR steps
# of this workflow need to push a branch and open a PR.
permissions:
  contents: write
  pull-requests: write
jobs:
  # Single job: build the proxy, run the full e2e corpus, generate the
  # nightly report, run the judge calibration corpus, upload artifacts,
  # and open one auto-PR per report.
  e2e-nightly:
    name: E2E Nightly
    runs-on: ubuntu-latest
    # 1-hour hard cap per L10 acceptance. The full corpus is ~12 min
    # against the mock; budget for real-LLM latency + retries.
    timeout-minutes: 60
    steps:
      # Actions are pinned to full commit SHAs; the trailing comment
      # names the tag/channel the pin is meant to correspond to.
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6
        with:
          python-version: "3.12"

      - name: Install Python deps
        run: python3 -m pip install -r requirements-e2e.txt

      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable

      - name: Install protoc
        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler

      # Cache cargo downloads and the build output, keyed on Cargo.lock.
      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            target/
          key: ${{ runner.os }}-cargo-e2e-nightly-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-e2e-nightly-

      - name: Build llmtrace-proxy (release)
        run: cargo build --release --manifest-path crates/llmtrace-proxy/Cargo.toml

      - name: Run full corpus
        env:
          # The manual-dispatch input is handed to the script through an
          # environment variable instead of being expanded inline with
          # `${{ }}`. Inline expansion pastes the raw input into the
          # shell source, so a crafted value could inject commands.
          NIGHTLY_COST_CAP_USD: ${{ github.event.inputs.cost_cap_usd || '2.0' }}
          # Optional real-upstream override; conftest falls back to the
          # in-process mock when unset so this workflow runs unchanged
          # whether or not the secret is configured.
          LLMTRACE_E2E_REAL_UPSTREAM_URL: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_URL }}
          LLMTRACE_E2E_REAL_UPSTREAM_MODEL: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_MODEL }}
          # Optional auth for real-upstream runs. Prefer the raw API-key
          # secret; use the Authorization secret only for providers that
          # require a non-Bearer scheme or additional header parameters.
          LLMTRACE_E2E_REAL_UPSTREAM_API_KEY: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_API_KEY }}
          LLMTRACE_E2E_REAL_UPSTREAM_AUTHORIZATION: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_AUTHORIZATION }}
          LLMTRACE_JUDGE_OPENAI_API_KEY: ${{ secrets.LLMTRACE_JUDGE_OPENAI_API_KEY }}
          # LLM-backed upstream-fell-for-it judge (#123). Auto-enabled
          # when MOONSHOT_API_KEY is configured as a repo secret. Without
          # it the harness keeps the regex baseline so the nightly never
          # breaks on a missing secret. Calibration evidence committed
          # at docs/research/results/upstream_judge_calibration_kimi-k2-6_2026-04-28.md
          # — kimi-k2.6 hit 12/12 = 100% vs regex's 10/12 = 83.3% at
          # ~$0.0016/call, so per-session cap of $0.50 is a safety margin
          # not a budget.
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND: openai
          LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL: kimi-k2.6
          LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL: https://api.moonshot.ai/v1
          LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV: MOONSHOT_API_KEY
          LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD: "0.50"
        run: |
          mkdir -p out
          # Defensive gate: only flip to the LLM judge when its
          # credential is present. RegexUpstreamJudge stays the
          # fallback so the workflow degrades gracefully when no
          # secret is configured.
          if [ -n "${MOONSHOT_API_KEY:-}" ]; then
            export LLMTRACE_E2E_UPSTREAM_JUDGE=llm
            echo "::notice title=Upstream judge::LLMUpstreamJudge (backend=${LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND}, model=${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}, cap=\$${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD})"
          else
            echo "::notice title=Upstream judge::RegexUpstreamJudge (MOONSHOT_API_KEY secret not set; LLM backend skipped)"
          fi
          # `|| true` so the report still generates on test failures —
          # a regression IS the report's reason to exist.
          python3 -m pytest tests/e2e/test_cascade.py \
            -v \
            --scenario-results-json=out/scenario-results.json \
            --cost-cap-usd="${NIGHTLY_COST_CAP_USD}" \
            --junit-xml=out/junit-nightly.xml \
            --color=yes || true

      # Publishes the UTC run date as the `date` step output (written
      # before the generator runs) and renders the nightly report.
      - name: Generate report
        id: report
        run: |
          DATE=$(date -u +%F)
          echo "date=$DATE" >> "$GITHUB_OUTPUT"
          python3 scripts/e2e/generate_nightly_report.py \
            --results-json out/scenario-results.json \
            --report-dir docs/research/results/ \
            --date "$DATE"

      # Calibration corpus — promotes the controlled hand-labelled set
      # (12 cases as of 2026-04-29) to a regression bed that runs
      # daily. Independent of the e2e corpus above: that one exercises
      # the proxy end-to-end against a (mocked) upstream; this one
      # asks the LLM judge "is the regex baseline still drifting from
      # ground truth on the same 12 verdicts as yesterday." Runs only
      # when MOONSHOT_API_KEY is configured (same defensive gate as
      # the e2e judge step). Fails-soft via `|| true` so the rest of
      # the workflow (artifact upload, e2e auto-PR) still completes
      # if the calibration call hits the cost cap or has a transient
      # API error.
      - name: Calibrate upstream judge
        id: calibrate
        env:
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND: openai
          LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL: kimi-k2.6
          LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL: https://api.moonshot.ai/v1
          LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV: MOONSHOT_API_KEY
          # Tighter than the e2e cap ($0.50) because the corpus is 12
          # cases not 50, runs daily, and a leak should fail loud sooner.
          # Expected spend: ~$0.02 (12 × $0.0016/call against kimi-k2.6).
          LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD: "0.10"
        run: |
          if [ -z "${MOONSHOT_API_KEY:-}" ]; then
            echo "::notice title=Calibration skipped::MOONSHOT_API_KEY secret not set; calibration corpus needs the LLM tier."
            echo "skipped=true" >> "$GITHUB_OUTPUT"
          else
            echo "::notice title=Calibration corpus::backend=${LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND}, model=${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}, cap=\$${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD}"
            # The backend comes from the same env var the notice above
            # prints, so the log line and the actual call cannot drift.
            python3 scripts/e2e/calibrate_upstream_judge.py \
              --backend "${LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND}" \
              --model "${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}" \
              --base-url "${LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL}" \
              --api-key-env "${LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV}" \
              --cost-cap-usd "${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD}" \
              || true
            echo "skipped=false" >> "$GITHUB_OUTPUT"
          fi

      # Runs even after a failure. If the report step never ran, its
      # `date` output is empty, so fall back to the run id to keep the
      # artifact name unique and meaningful.
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7
        with:
          name: e2e-nightly-${{ steps.report.outputs.date || github.run_id }}
          path: |
            out/
            tests/e2e/.logs/
            docs/research/results/upstream_judge_calibration_*.json
          if-no-files-found: warn

      # Auto-PR the report. peter-evans/create-pull-request handles the
      # branch creation, push, and PR open in one step. It's a no-op if
      # there are no changes (e.g. byte-identical report from yesterday)
      # so we can run unconditionally.
      #
      # `continue-on-error: true` keeps the run green when the repo's
      # "Allow GitHub Actions to create and approve pull requests" flag
      # is OFF (Settings → Actions → General). The e2e run has already
      # finished at this point (pytest failures are tolerated via
      # `|| true` and show up in the report) and the report sits in the
      # artifact bundle either way. The status step further down makes
      # the failure visible in the workflow summary so it's never
      # silently ignored.
      - name: Open report PR
        id: open_pr
        continue-on-error: true
        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1  # v8
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore(e2e): nightly report ${{ steps.report.outputs.date }}"
          title: "chore(e2e): nightly report ${{ steps.report.outputs.date }}"
          body: |
            Auto-generated by the E2E Nightly workflow (#100).
            See `docs/research/results/e2e_${{ steps.report.outputs.date }}.md`
            for the full report. The diff section is the primary value;
            zero regressions == merge without review.
          branch: auto/e2e-nightly-${{ steps.report.outputs.date }}
          delete-branch: true
          add-paths: |
            docs/research/results/e2e_*.md
            docs/research/results/e2e_*.json
          labels: |
            e2e-nightly
            automated

      # Calibration auto-PR — separate from the e2e auto-PR because
      # the two reports are independent artifacts with different
      # determinism contracts (calibration markdown is byte-stable
      # when verdicts are identical; the JSON sidecar is not, which
      # is why it is deliberately excluded from `add-paths` below).
      - name: Open calibration PR
        if: steps.calibrate.outputs.skipped != 'true'
        id: open_calibration_pr
        continue-on-error: true
        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1  # v8
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore(e2e): calibration report ${{ steps.report.outputs.date }}"
          title: "chore(e2e): calibration report ${{ steps.report.outputs.date }}"
          body: |
            Auto-generated by the E2E Nightly workflow's calibration
            step (validation-gap 2b for IS-060, see #148).
            See `docs/research/results/upstream_judge_calibration_${{ steps.report.outputs.date }}.md`
            for the report.
            The diff vs the previous calibration report is the primary
            value; this PR will be a no-op on days where regex/LLM
            verdicts and rule-class counts match yesterday's run.
            The matching JSON sidecar
            (`upstream_judge_calibration_*.json`) carries the
            free-text reasons and exact per-call cost; it is NOT
            committed via this PR (excluded from add-paths) so
            non-deterministic per-call data does not produce daily
            review noise. The sidecar lives in the workflow's
            artifact bundle.
          branch: auto/calibration-${{ steps.report.outputs.date }}
          delete-branch: true
          add-paths: |
            docs/research/results/upstream_judge_calibration_*.md
          labels: |
            e2e-nightly
            calibration
            automated

      # Reports what happened to both auto-PR steps. Step outcomes are
      # passed in as env vars and three cases are told apart:
      #   success -> notice
      #   failure -> warning plus a how-to-fix entry in the run summary
      #   other   -> the PR step never ran (an earlier step failed or
      #              the run was cancelled); say so instead of blaming
      #              workflow permissions.
      - name: Surface PR-step status
        if: always()
        env:
          OPEN_PR_OUTCOME: ${{ steps.open_pr.outcome }}
          CALIBRATION_SKIPPED: ${{ steps.calibrate.outputs.skipped }}
          CALIBRATION_PR_OUTCOME: ${{ steps.open_calibration_pr.outcome }}
          ARTIFACT_NAME: e2e-nightly-${{ steps.report.outputs.date || github.run_id }}
        run: |
          case "${OPEN_PR_OUTCOME}" in
            success)
              echo "::notice title=Nightly report PR opened::Auto-PR submitted (or no diff vs previous run)."
              ;;
            failure)
              {
                echo "## Nightly report PR step failed"
                echo ""
                echo "The e2e corpus run completed; only the auto-PR step failed."
                echo "Most common cause: repo Settings → Actions → General →"
                echo "Workflow permissions → 'Allow GitHub Actions to create and"
                echo "approve pull requests' is OFF."
                echo ""
                echo "Report and sidecar are still available in the run artifact"
                echo "(\`${ARTIFACT_NAME}\`)."
              } >> "$GITHUB_STEP_SUMMARY"
              echo "::warning title=Nightly auto-PR step failed::Check workflow summary for the fix."
              ;;
            *)
              echo "::warning title=Nightly auto-PR not attempted::Step outcome was '${OPEN_PR_OUTCOME:-unknown}'; an earlier step failed or the run was cancelled."
              ;;
          esac
          # Same fail-loud pattern for the calibration auto-PR. The
          # calibration step itself sets `skipped=true` when the
          # MOONSHOT_API_KEY secret is absent — that is a config
          # state, not a failure, so don't warn.
          if [ "${CALIBRATION_SKIPPED}" = "true" ]; then
            echo "::notice title=Calibration PR skipped::MOONSHOT_API_KEY not configured; nothing to PR."
          elif [ "${CALIBRATION_PR_OUTCOME}" = "success" ]; then
            echo "::notice title=Calibration PR opened::Auto-PR submitted (or no diff vs previous run)."
          elif [ "${CALIBRATION_PR_OUTCOME}" = "failure" ]; then
            {
              echo "## Calibration PR step failed"
              echo ""
              echo "The calibration step completed; only the auto-PR step failed."
              echo "Same root cause as the e2e auto-PR: workflow permissions."
              echo ""
              echo "Calibration report + sidecar JSON are in the run artifact"
              echo "(\`${ARTIFACT_NAME}\`)."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::warning title=Calibration auto-PR step failed::Check workflow summary."
          else
            echo "::warning title=Calibration auto-PR not attempted::Step outcome was '${CALIBRATION_PR_OUTCOME:-unknown}'; an earlier step failed or the run was cancelled."
          fi

      # Helper — fail-loud when the run is cancelled, which is how the
      # wall-clock cap (L10 acceptance) is expected to surface here.
      # cancelled() is also true for a manual cancel, so the message
      # names both possibilities rather than asserting a timeout.
      - name: Timeout helper
        if: cancelled()
        run: |
          echo "::error::Nightly was cancelled (60 min wall-clock cap exceeded, or a manual cancel)."
          echo "If this was the cap, knobs are: reduce the corpus size,"
          echo "parallelise scenarios (would also need conftest.py"
          echo "loosening serial guard), or split into two nightly jobs"
          echo "by family."
          exit 1