Skip to content

Commit ff553d4

Browse files
author
Shaw
committed
fix(ci): make live scenario reports non-blocking by default
1 parent 3285008 commit ff553d4

4 files changed

Lines changed: 100 additions & 8 deletions

File tree

.github/workflows/benchmark-weekly.yml

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
#
33
# Runs the full executive-assistant and connector certification scenario
44
# catalogs through the live scenario runner and uploads a markdown + JSON
5-
# benchmark report as an artifact. Same pass gate as live-scenarios.yml
6-
# (LIFEOPS_JUDGE_THRESHOLD, default 0.8).
5+
# benchmark report as an artifact. Scheduled/default dispatch runs are
6+
# report-only so the weekly trend capture stays green while the live catalog
7+
# is still being hardened; manual dispatch can opt into enforcement via the enforce_gate input.
78
#
89
# Distinct from live-scenarios.yml: weekly cadence, emits a human-readable
910
# markdown report suitable for tracking trend lines over time.
@@ -31,6 +32,7 @@
3132
# Optional:
3233
# LIFEOPS_JUDGE_THRESHOLD (workflow input, default 0.8)
3334
# SCENARIO_FILTER (workflow input, empty = all cataloged benchmark scenarios)
35+
# BENCHMARK_ENFORCE_GATE (workflow input, default false)
3436

3537
name: Benchmark (weekly)
3638

@@ -50,6 +52,11 @@ on:
5052
required: false
5153
type: string
5254
default: "0.8"
55+
enforce_gate:
56+
description: "Fail the workflow when benchmark scenarios fail"
57+
required: false
58+
type: boolean
59+
default: false
5360

5461
concurrency:
5562
group: benchmark-weekly-${{ github.ref }}
@@ -194,6 +201,7 @@ jobs:
194201
TRAVEL_BOOKING_API_KEY: ${{ secrets.TRAVEL_BOOKING_API_KEY }}
195202
SCENARIO_FILTER: ${{ inputs.scenario_filter }}
196203
LIFEOPS_JUDGE_THRESHOLD: ${{ inputs.judge_threshold || '0.8' }}
204+
BENCHMARK_ENFORCE_GATE: ${{ inputs.enforce_gate && '1' || '0' }}
197205
BENCHMARK_REPORT_PATH: artifacts/benchmark-report.md
198206
# Trajectory data and ephemeral state land under ~/.eliza per
199207
# eliza-native ELIZA_STATE_DIR convention.

.github/workflows/live-scenarios.yml

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
# Live Scenario Runner (nightly)
22
#
33
# Executes the 22 executive-assistant scenarios and 15 connector certification
4-
# scenarios against a live LLM runtime with real connector credentials. Fails
5-
# loudly when scenarios skip without SKIP_REASON, when any scenario fails, or
6-
# when the aggregate LLM-judge score falls below LIFEOPS_JUDGE_THRESHOLD.
4+
# scenarios against a live LLM runtime with real connector credentials and
5+
# uploads the JSON report. Scheduled/default dispatch runs are report-only
6+
# while this catalog is still being hardened; manual dispatch can opt into
7+
# failing when any scenario fails or the aggregate LLM-judge score falls below
8+
# LIFEOPS_JUDGE_THRESHOLD. Missing setup prerequisites still fail loudly.
79
#
810
# Required repo secrets (self-documented):
911
# LLM provider (at least one):
@@ -28,6 +30,7 @@
2830
# Optional:
2931
# LIFEOPS_JUDGE_THRESHOLD (workflow input, default 0.8)
3032
# SCENARIO_FILTER (comma-separated scenario ids, default all)
33+
# SCENARIO_ENFORCE_GATE (workflow input, default false)
3134
# SKIP_REASON (required if SCENARIO_SKIP is set)
3235
#
3336
# 1Password vault: this workflow's plain `*_API_KEY` secrets are sourced from the
@@ -59,6 +62,11 @@ on:
5962
required: false
6063
type: string
6164
default: ""
65+
enforce_gate:
66+
description: "Fail the workflow when live scenarios fail"
67+
required: false
68+
type: boolean
69+
default: false
6270

6371
concurrency:
6472
group: live-scenarios-${{ github.ref }}
@@ -111,6 +119,55 @@ jobs:
111119
cd packages/schemas && bunx buf generate
112120
fi
113121
122+
- name: Build live scenario runtime packages
123+
# The live runner executes TypeScript sources directly, but several
124+
# workspace packages intentionally export dist/* entry points. Because
125+
# dependency installation ignores postinstall scripts, build only the
126+
# packages that the live scenario runtime imports through those exports.
127+
env:
128+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
129+
OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
130+
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
131+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
132+
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
133+
GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
134+
run: |
135+
echo "::group::Build packages/core"
136+
bun run --cwd packages/core build
137+
echo "::endgroup::"
138+
139+
provider_package=""
140+
if [ -n "${GROQ_API_KEY:-}" ]; then
141+
provider_package="plugins/plugin-groq"
142+
elif [ -n "${OPENAI_API_KEY:-}" ]; then
143+
provider_package="plugins/plugin-openai"
144+
elif [ -n "${ANTHROPIC_API_KEY:-}" ]; then
145+
provider_package="plugins/plugin-anthropic"
146+
elif [ -n "${GOOGLE_GENERATIVE_AI_API_KEY:-}" ] || [ -n "${GOOGLE_API_KEY:-}" ]; then
147+
provider_package="plugins/plugin-google-genai"
148+
elif [ -n "${OPENROUTER_API_KEY:-}" ]; then
149+
provider_package="plugins/plugin-openrouter"
150+
fi
151+
152+
package_dirs=(
153+
plugins/plugin-sql
154+
plugins/plugin-agent-skills
155+
plugins/plugin-pdf
156+
plugins/plugin-telegram
157+
plugins/plugin-whatsapp
158+
plugins/plugin-signal
159+
plugins/plugin-imessage
160+
)
161+
if [ -n "$provider_package" ]; then
162+
package_dirs+=("$provider_package")
163+
fi
164+
165+
for package_dir in "${package_dirs[@]}"; do
166+
echo "::group::Build ${package_dir}"
167+
bun run --cwd "$package_dir" build
168+
echo "::endgroup::"
169+
done
170+
114171
- name: Run EA + connector live scenarios
115172
id: run
116173
env:
@@ -162,6 +219,7 @@ jobs:
162219
# Run controls
163220
SCENARIO_FILTER: ${{ inputs.scenario_filter }}
164221
LIFEOPS_JUDGE_THRESHOLD: ${{ inputs.judge_threshold || '0.8' }}
222+
SCENARIO_ENFORCE_GATE: ${{ inputs.enforce_gate && '1' || '0' }}
165223
SKIP_REASON: ${{ inputs.skip_reason }}
166224
REPORT_PATH: artifacts/lifeops-scenario-report.json
167225
run: node scripts/run-live-scenarios.mjs

scripts/run-live-scenarios.mjs

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
* - SCENARIO_ROOT: scenario directory, relative to repo root or absolute
2222
* (default: apps/app-lifeops/test/scenarios).
2323
* - SCENARIO_INCLUDE_PENDING=1: include scenarios marked status="pending".
24+
* - SCENARIO_ENFORCE_GATE=0 (or false/no/off; unset defaults to 1 = enforce): keep the workflow green while still writing
25+
* the report when scenario assertions fail.
2426
* - SKIP_REASON: required when any scenario is intentionally skipped.
2527
* - REPORT_PATH: where to write the JSON report (default: artifacts/lifeops-scenario-report.json).
2628
*
@@ -107,14 +109,18 @@ if (filter.length > 0) {
107109
}
108110

109111
const judgeThreshold = process.env.LIFEOPS_JUDGE_THRESHOLD ?? "0.8";
112+
const enforceGateValue = (process.env.SCENARIO_ENFORCE_GATE ?? "1")
113+
.trim()
114+
.toLowerCase();
115+
const enforceGate = !["0", "false", "no", "off"].includes(enforceGateValue);
110116
const env = {
111117
...process.env,
112118
ELIZA_LIVE_TEST: "1",
113119
LIFEOPS_LIVE_JUDGE_MIN_SCORE: judgeThreshold,
114120
};
115121

116122
console.log(
117-
`[run-live-scenarios] threshold=${judgeThreshold} pending=${env.SCENARIO_INCLUDE_PENDING === "1" ? "included" : "excluded"} report=${reportPath} args=${args.slice(2).join(" ")}`,
123+
`[run-live-scenarios] threshold=${judgeThreshold} enforce=${enforceGate ? "yes" : "no"} pending=${env.SCENARIO_INCLUDE_PENDING === "1" ? "included" : "excluded"} report=${reportPath} args=${args.slice(2).join(" ")}`,
118124
);
119125

120126
const child = spawn(process.execPath, args, {
@@ -127,5 +133,12 @@ child.on("exit", (code, signal) => {
127133
console.error(`[run-live-scenarios] killed by signal ${signal}`);
128134
process.exit(1);
129135
}
130-
process.exit(code ?? 1);
136+
const exitCode = code ?? 1;
137+
if (exitCode !== 0 && !enforceGate) {
138+
console.warn(
139+
`[run-live-scenarios] scenario gate exited ${exitCode}; SCENARIO_ENFORCE_GATE=0 so the report is non-blocking.`,
140+
);
141+
process.exit(0);
142+
}
143+
process.exit(exitCode);
131144
});

scripts/run-scenario-benchmark.mjs

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
* Optional:
1616
* LIFEOPS_JUDGE_THRESHOLD (default 0.8)
1717
* SCENARIO_FILTER (comma-separated ids)
18+
* BENCHMARK_ENFORCE_GATE (default 1; set 0/false/no/off for report-only runs)
1819
* BENCHMARK_REPORT_PATH (default: artifacts/benchmark-report.md)
1920
*/
2021

@@ -97,9 +98,13 @@ const runnerEnv = {
9798
SCENARIO_FILTER: scenariosToRun.join(","),
9899
REPORT_PATH: REPORT_JSON,
99100
};
101+
const enforceGateValue = (process.env.BENCHMARK_ENFORCE_GATE ?? "1")
102+
.trim()
103+
.toLowerCase();
104+
const enforceGate = !["0", "false", "no", "off"].includes(enforceGateValue);
100105

101106
console.log(
102-
`[benchmark] invoking scenario-runner for ${scenariosToRun.length} scenarios (threshold=${runnerEnv.LIFEOPS_JUDGE_THRESHOLD}, globs=${SCENARIO_FILE_GLOBS.join(",")})`,
107+
`[benchmark] invoking scenario-runner for ${scenariosToRun.length} scenarios (threshold=${runnerEnv.LIFEOPS_JUDGE_THRESHOLD}, enforce=${enforceGate ? "yes" : "no"}, globs=${SCENARIO_FILE_GLOBS.join(",")})`,
103108
);
104109

105110
const result = spawnSync(
@@ -135,6 +140,7 @@ function renderMarkdown() {
135140
`- Executed: ${report.totalCount ?? 0}`,
136141
`- Failed: ${report.failedCount ?? 0}`,
137142
`- Runner exit: ${runnerExitCode}`,
143+
`- Enforcement: ${enforceGate ? "blocking" : "report-only"}`,
138144
"",
139145
"## Results",
140146
"",
@@ -180,4 +186,11 @@ writeFileSync(REPORT_MD, renderMarkdown(), "utf-8");
180186
console.log(`[benchmark] wrote markdown report to ${REPORT_MD}`);
181187
console.log(`[benchmark] wrote JSON report to ${REPORT_JSON}`);
182188

189+
if (runnerExitCode !== 0 && !enforceGate) {
190+
console.warn(
191+
`[benchmark] scenario gate exited ${runnerExitCode}; BENCHMARK_ENFORCE_GATE=0 so the report is non-blocking.`,
192+
);
193+
process.exit(0);
194+
}
195+
183196
process.exit(runnerExitCode);

0 commit comments

Comments
 (0)