Skip to content

E2E Nightly

E2E Nightly #37

Workflow file for this run

# E2E Nightly Workflow (Loop E2E-L10 of #91, issue #100).
#
# Runs the full 50-scenario adversarial corpus once per night and
# auto-PRs a markdown report. The diff vs the previous report is the
# primary value; the per-run roll-up is context.
#
# Today the upstream is the in-process FastAPI mock (matching the
# PR-gate workflow). When `LLMTRACE_E2E_REAL_UPSTREAM_URL` is wired up
# as a repo secret, the conftest will use it instead of the mock so
# the nightly exercises a real LLM. Cost cap (`--cost-cap-usd`) gates
# the session in either mode.
name: E2E Nightly

on:
  # Scheduled trigger: 03:00 UTC every day. That is outside US/EU
  # business hours, so a flaky nightly does not hold up a PR review.
  schedule:
    - cron: "0 3 * * *"
  # Manual trigger: lets the caller override the per-session cost cap.
  workflow_dispatch:
    inputs:
      cost_cap_usd:
        description: "Per-session cost cap in USD (default: 2.0)"
        required: false
        default: "2.0"

# Serialise nightlies. The state that matters (the committed report) is
# shared, so parallel runs add nothing; an in-flight run is never
# cancelled in favour of a newer one.
concurrency:
  group: e2e-nightly
  cancel-in-progress: false

# Write scopes granted to the workflow token; the auto-PR steps in this
# workflow pass that token to push a branch and open a pull request.
permissions:
  contents: write
  pull-requests: write
jobs:
  # Single job: build the proxy, run the adversarial corpus, generate the
  # nightly report, optionally calibrate the LLM judge, upload artifacts,
  # then open the auto-PRs and surface their status.
  e2e-nightly:
    name: E2E Nightly
    runs-on: ubuntu-latest
    # 1-hour hard cap per L10 acceptance. The full corpus is ~12 min
    # against the mock; budget for real-LLM latency + retries.
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405  # v6
        with:
          python-version: "3.12"
      - name: Install Python deps
        run: python3 -m pip install -r requirements-e2e.txt
      - uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable
      - name: Install protoc
        run: sudo apt-get update && sudo apt-get install -y protobuf-compiler
      # Cache cargo downloads and the build output, keyed on Cargo.lock.
      - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae  # v5
        with:
          path: |
            ~/.cargo/bin/
            ~/.cargo/registry/index/
            ~/.cargo/registry/cache/
            ~/.cargo/git/db/
            target/
          key: ${{ runner.os }}-cargo-e2e-nightly-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-cargo-e2e-nightly-
      - name: Build llmtrace-proxy (release)
        run: cargo build --release --manifest-path crates/llmtrace-proxy/Cargo.toml
      - name: Run full corpus
        env:
          # Optional real-upstream override; conftest falls back to the
          # in-process mock when unset so this workflow runs unchanged
          # whether or not the secret is configured.
          LLMTRACE_E2E_REAL_UPSTREAM_URL: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_URL }}
          LLMTRACE_E2E_REAL_UPSTREAM_MODEL: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_MODEL }}
          # Optional auth for real-upstream runs. Prefer the raw API-key
          # secret; use the Authorization secret only for providers that
          # require a non-Bearer scheme or additional header parameters.
          LLMTRACE_E2E_REAL_UPSTREAM_API_KEY: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_API_KEY }}
          LLMTRACE_E2E_REAL_UPSTREAM_AUTHORIZATION: ${{ secrets.LLMTRACE_E2E_REAL_UPSTREAM_AUTHORIZATION }}
          LLMTRACE_JUDGE_OPENAI_API_KEY: ${{ secrets.LLMTRACE_JUDGE_OPENAI_API_KEY }}
          # LLM-backed upstream-fell-for-it judge (#123). Auto-enabled
          # when MOONSHOT_API_KEY is configured as a repo secret. Without
          # it the harness keeps the regex baseline so the nightly never
          # breaks on a missing secret. Calibration evidence committed
          # at docs/research/results/upstream_judge_calibration_kimi-k2-6_2026-04-28.md
          # — kimi-k2.6 hit 12/12 = 100% vs regex's 10/12 = 83.3% at
          # ~$0.0016/call, so per-session cap of $0.50 is a safety margin
          # not a budget.
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND: openai
          LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL: kimi-k2.6
          LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL: https://api.moonshot.ai/v1
          LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV: MOONSHOT_API_KEY
          LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD: "0.50"
          # The workflow_dispatch input reaches the script through the
          # environment instead of being expanded into the script text,
          # so its value can never be parsed as shell syntax. Scheduled
          # runs have no inputs and fall back to 2.0.
          NIGHTLY_COST_CAP_USD: ${{ github.event.inputs.cost_cap_usd || '2.0' }}
        run: |
          mkdir -p out
          # Defensive gate: only flip to the LLM judge when its
          # credential is present. RegexUpstreamJudge stays the
          # fallback so the workflow degrades gracefully when no
          # secret is configured.
          if [ -n "${MOONSHOT_API_KEY:-}" ]; then
            export LLMTRACE_E2E_UPSTREAM_JUDGE=llm
            echo "::notice title=Upstream judge::LLMUpstreamJudge (backend=${LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND}, model=${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}, cap=\$${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD})"
          else
            echo "::notice title=Upstream judge::RegexUpstreamJudge (MOONSHOT_API_KEY secret not set; LLM backend skipped)"
          fi
          # `|| true` so the report still generates on test failures —
          # a regression IS the report's reason to exist.
          python3 -m pytest tests/e2e/test_cascade.py \
            -v \
            --scenario-results-json=out/scenario-results.json \
            --cost-cap-usd="${NIGHTLY_COST_CAP_USD}" \
            --junit-xml=out/junit-nightly.xml \
            --color=yes || true
      # Stamps the run with today's UTC date (exported as the `date`
      # output, reused by the artifact name and both auto-PRs) and
      # renders the report from the corpus results.
      - name: Generate report
        id: report
        run: |
          DATE=$(date -u +%F)
          echo "date=$DATE" >> "$GITHUB_OUTPUT"
          python3 scripts/e2e/generate_nightly_report.py \
            --results-json out/scenario-results.json \
            --report-dir docs/research/results/ \
            --date "$DATE"
      # Calibration corpus — promotes the controlled hand-labelled set
      # (12 cases as of 2026-04-29) to a regression bed that runs
      # daily. Independent of the e2e corpus above: that one exercises
      # the proxy end-to-end against a (mocked) upstream; this one
      # asks the LLM judge "is the regex baseline still drifting from
      # ground truth on the same 12 verdicts as yesterday." Runs only
      # when MOONSHOT_API_KEY is configured (same defensive gate as
      # the e2e judge step). Fails-soft via `|| true` so the rest of
      # the workflow (artifact upload, e2e auto-PR) still completes
      # if the calibration call hits the cost cap or has a transient
      # API error.
      - name: Calibrate upstream judge
        id: calibrate
        env:
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND: openai
          LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL: kimi-k2.6
          LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL: https://api.moonshot.ai/v1
          LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV: MOONSHOT_API_KEY
          # Tighter than the e2e cap ($0.50) because the corpus is 12
          # cases not 50, runs daily, and a leak should fail loud sooner.
          # Expected spend: ~$0.02 (12 × $0.0016/call against kimi-k2.6).
          LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD: "0.10"
        run: |
          if [ -z "${MOONSHOT_API_KEY:-}" ]; then
            echo "::notice title=Calibration skipped::MOONSHOT_API_KEY secret not set; calibration corpus needs the LLM tier."
            echo "skipped=true" >> "$GITHUB_OUTPUT"
          else
            echo "::notice title=Calibration corpus::backend=${LLMTRACE_E2E_UPSTREAM_JUDGE_BACKEND}, model=${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}, cap=\$${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD}"
            python3 scripts/e2e/calibrate_upstream_judge.py \
              --backend openai \
              --model "${LLMTRACE_E2E_UPSTREAM_JUDGE_MODEL}" \
              --base-url "${LLMTRACE_E2E_UPSTREAM_JUDGE_BASE_URL}" \
              --api-key-env "${LLMTRACE_E2E_UPSTREAM_JUDGE_API_KEY_ENV}" \
              --cost-cap-usd "${LLMTRACE_E2E_UPSTREAM_JUDGE_COST_CAP_USD}" \
              || true
            echo "skipped=false" >> "$GITHUB_OUTPUT"
          fi
      # Runs even when an earlier step failed so logs and partial output
      # are still collected; missing paths only produce a warning.
      - name: Upload artifacts
        if: always()
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7
        with:
          name: e2e-nightly-${{ steps.report.outputs.date }}
          path: |
            out/
            tests/e2e/.logs/
            docs/research/results/upstream_judge_calibration_*.json
          if-no-files-found: warn
      # Auto-PR the report. peter-evans/create-pull-request handles the
      # branch creation, push, and PR open in one step. It's a no-op if
      # there are no changes (e.g. byte-identical report from yesterday)
      # so we can run unconditionally.
      #
      # `continue-on-error: true` keeps the run green when the repo's
      # "Allow GitHub Actions to create and approve pull requests" flag
      # is OFF (Settings → Actions → General). By this point the corpus
      # has run and the report has been generated (scenario failures do
      # not stop the run — see `|| true` on the pytest call), and the
      # report sits in the artifact bundle either way. The status step
      # further down makes the failure visible in the workflow summary
      # so it's never silently ignored.
      - name: Open report PR
        id: open_pr
        continue-on-error: true
        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1  # v8
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore(e2e): nightly report ${{ steps.report.outputs.date }}"
          title: "chore(e2e): nightly report ${{ steps.report.outputs.date }}"
          body: |
            Auto-generated by the E2E Nightly workflow (#100).
            See `docs/research/results/e2e_${{ steps.report.outputs.date }}.md`
            for the full report. The diff section is the primary value;
            zero regressions == merge without review.
          branch: auto/e2e-nightly-${{ steps.report.outputs.date }}
          delete-branch: true
          add-paths: |
            docs/research/results/e2e_*.md
            docs/research/results/e2e_*.json
          labels: |
            e2e-nightly
            automated
      # Calibration auto-PR — separate from the e2e auto-PR because
      # the two reports are independent artifacts with different
      # determinism contracts (calibration markdown is byte-stable
      # when verdicts are identical; the JSON sidecar is not, which
      # is why it is deliberately excluded from `add-paths` below).
      - name: Open calibration PR
        if: steps.calibrate.outputs.skipped != 'true'
        id: open_calibration_pr
        continue-on-error: true
        uses: peter-evans/create-pull-request@5f6978faf089d4d20b00c7766989d076bb2fc7f1  # v8
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          commit-message: "chore(e2e): calibration report ${{ steps.report.outputs.date }}"
          title: "chore(e2e): calibration report ${{ steps.report.outputs.date }}"
          body: |
            Auto-generated by the E2E Nightly workflow's calibration
            step (validation-gap 2b for IS-060, see #148).
            See `docs/research/results/upstream_judge_calibration_${{ steps.report.outputs.date }}.md`
            for the report.
            The diff vs the previous calibration report is the primary
            value; this PR will be a no-op on days where regex/LLM
            verdicts and rule-class counts match yesterday's run.
            The matching JSON sidecar
            (`upstream_judge_calibration_*.json`) carries the
            free-text reasons and exact per-call cost; it is NOT
            committed via this PR (excluded from add-paths) so
            non-deterministic per-call data does not produce daily
            review noise. The sidecar lives in the workflow's
            artifact bundle.
          branch: auto/calibration-${{ steps.report.outputs.date }}
          delete-branch: true
          add-paths: |
            docs/research/results/upstream_judge_calibration_*.md
          labels: |
            e2e-nightly
            calibration
            automated
      # Reports how each auto-PR step ended. Because this step runs under
      # `always()`, a PR step can also be `skipped` (an earlier step
      # failed, so it never ran); that case is reported separately
      # instead of being blamed on workflow permissions.
      - name: Surface PR-step status
        if: always()
        run: |
          if [ "${{ steps.open_pr.outcome }}" = "success" ]; then
            echo "::notice title=Nightly report PR opened::Auto-PR submitted (or no diff vs previous run)."
          elif [ "${{ steps.open_pr.outcome }}" = "skipped" ]; then
            echo "::warning title=Nightly report PR skipped::An earlier step failed, so no report PR was attempted."
          else
            {
              echo "## Nightly report PR step failed"
              echo ""
              echo "The e2e corpus ran and the report was generated; only the auto-PR step failed."
              echo "Most common cause: repo Settings → Actions → General →"
              echo "Workflow permissions → 'Allow GitHub Actions to create and"
              echo "approve pull requests' is OFF."
              echo ""
              echo "Report and sidecar are still available in the run artifact"
              echo "(\`e2e-nightly-${{ steps.report.outputs.date }}\`)."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::warning title=Nightly auto-PR step failed::Check workflow summary for the fix."
          fi
          # Same fail-loud pattern for the calibration auto-PR. The
          # calibration step itself sets `skipped=true` when the
          # MOONSHOT_API_KEY secret is absent — that is a config
          # state, not a failure, so don't warn.
          if [ "${{ steps.calibrate.outputs.skipped }}" = "true" ]; then
            echo "::notice title=Calibration PR skipped::MOONSHOT_API_KEY not configured; nothing to PR."
          elif [ "${{ steps.open_calibration_pr.outcome }}" = "success" ]; then
            echo "::notice title=Calibration PR opened::Auto-PR submitted (or no diff vs previous run)."
          elif [ "${{ steps.open_calibration_pr.outcome }}" = "skipped" ]; then
            echo "::warning title=Calibration PR skipped::An earlier step failed, so no calibration PR was attempted."
          else
            {
              echo "## Calibration PR step failed"
              echo ""
              echo "The calibration script ran; only the auto-PR step failed."
              echo "Same root cause as the e2e auto-PR: workflow permissions."
              echo ""
              echo "Calibration report + sidecar JSON are in the run artifact"
              echo "(\`e2e-nightly-${{ steps.report.outputs.date }}\`)."
            } >> "$GITHUB_STEP_SUMMARY"
            echo "::warning title=Calibration auto-PR step failed::Check workflow summary."
          fi
      # Helper — fail-loud if we hit the wall-clock cap (matches L10
      # acceptance). Intended to fire when `timeout-minutes` cancels the
      # job. NOTE(review): cancelled() is also true when a run is
      # cancelled by hand, so this message can appear in that case too.
      - name: Timeout helper
        if: cancelled()
        run: |
          echo "::error::Nightly exceeded 60 min wall-clock cap."
          echo "Knob: reduce the corpus size, parallelise scenarios"
          echo "(would also need conftest.py loosening serial guard),"
          echo "or split into two nightly jobs by family."
          exit 1