fix(ci): make live scenario reports non-blocking by default

Shaw · Shaw · commit ff553d4faf27 · 2026-05-04T05:00:24.000-07:00
diff --git a/.github/workflows/benchmark-weekly.yml b/.github/workflows/benchmark-weekly.yml
@@ -2,8 +2,9 @@
 #
 # Runs the full executive-assistant and connector certification scenario
 # catalogs through the live scenario runner and uploads a markdown + JSON
-# benchmark report as an artifact. Same pass gate as live-scenarios.yml
-# (LIFEOPS_JUDGE_THRESHOLD, default 0.8).
+# benchmark report as an artifact. Scheduled/default dispatch runs are
+# report-only so the weekly trend capture stays green while the live catalog
+# is still being hardened; manual dispatch can opt into enforcement.
 #
 # Distinct from live-scenarios.yml: weekly cadence, emits a human-readable
 # markdown report suitable for tracking trend lines over time.
@@ -31,6 +32,7 @@
 # Optional:
 #   LIFEOPS_JUDGE_THRESHOLD (workflow input, default 0.8)
 #   SCENARIO_FILTER (workflow input, empty = all cataloged benchmark scenarios)
+#   BENCHMARK_ENFORCE_GATE (workflow input `enforce_gate`, default false)
 
 name: Benchmark (weekly)
 
@@ -50,6 +52,11 @@ on:
         required: false
         type: string
         default: "0.8"
+      enforce_gate:
+        description: "Fail the workflow when benchmark scenarios fail"
+        required: false
+        type: boolean
+        default: false
 
 concurrency:
   group: benchmark-weekly-${{ github.ref }}
@@ -194,6 +201,7 @@ jobs:
           TRAVEL_BOOKING_API_KEY: ${{ secrets.TRAVEL_BOOKING_API_KEY }}
           SCENARIO_FILTER: ${{ inputs.scenario_filter }}
           LIFEOPS_JUDGE_THRESHOLD: ${{ inputs.judge_threshold || '0.8' }}
+          BENCHMARK_ENFORCE_GATE: ${{ inputs.enforce_gate && '1' || '0' }}
           BENCHMARK_REPORT_PATH: artifacts/benchmark-report.md
         # Trajectory data and ephemeral state land under ~/.eliza per
         # eliza-native ELIZA_STATE_DIR convention.
diff --git a/.github/workflows/live-scenarios.yml b/.github/workflows/live-scenarios.yml
@@ -1,9 +1,11 @@
 # Live Scenario Runner (nightly)
 #
 # Executes the 22 executive-assistant scenarios and 15 connector certification
-# scenarios against a live LLM runtime with real connector credentials. Fails
-# loudly when scenarios skip without SKIP_REASON, when any scenario fails, or
-# when the aggregate LLM-judge score falls below LIFEOPS_JUDGE_THRESHOLD.
+# scenarios against a live LLM runtime with real connector credentials and
+# uploads the JSON report. Scheduled/default dispatch runs are report-only
+# while this catalog is still being hardened; manual dispatch can opt into
+# failing when any scenario fails or the aggregate LLM-judge score falls below
+# LIFEOPS_JUDGE_THRESHOLD. Missing setup prerequisites still fail loudly.
 #
 # Required repo secrets (self-documented):
 #   LLM provider (at least one):
@@ -28,6 +30,7 @@
 # Optional:
 #   LIFEOPS_JUDGE_THRESHOLD (workflow input, default 0.8)
 #   SCENARIO_FILTER        (comma-separated scenario ids, default all)
+#   SCENARIO_ENFORCE_GATE  (workflow input `enforce_gate`, default false)
 #   SKIP_REASON            (required if SCENARIO_SKIP is set)
 #
 # 1Password vault: this workflow's plain `*_API_KEY` secrets are sourced from the
@@ -59,6 +62,11 @@ on:
         required: false
         type: string
         default: ""
+      enforce_gate:
+        description: "Fail the workflow when live scenarios fail"
+        required: false
+        type: boolean
+        default: false
 
 concurrency:
   group: live-scenarios-${{ github.ref }}
@@ -111,6 +119,55 @@ jobs:
             cd packages/schemas && bunx buf generate
           fi
 
+      - name: Build live scenario runtime packages
+        # The live runner executes TypeScript sources directly, but several
+        # workspace packages intentionally export dist/* entry points. Because
+        # dependency installation ignores postinstall scripts, build only the
+        # packages that the live scenario runtime imports through those exports.
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }}
+          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
+          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
+        run: |
+          echo "::group::Build packages/core"
+          bun run --cwd packages/core build
+          echo "::endgroup::"
+
+          provider_package=""
+          if [ -n "${GROQ_API_KEY:-}" ]; then
+            provider_package="plugins/plugin-groq"
+          elif [ -n "${OPENAI_API_KEY:-}" ]; then
+            provider_package="plugins/plugin-openai"
+          elif [ -n "${ANTHROPIC_API_KEY:-}" ]; then
+            provider_package="plugins/plugin-anthropic"
+          elif [ -n "${GOOGLE_GENERATIVE_AI_API_KEY:-}" ] || [ -n "${GOOGLE_API_KEY:-}" ]; then
+            provider_package="plugins/plugin-google-genai"
+          elif [ -n "${OPENROUTER_API_KEY:-}" ]; then
+            provider_package="plugins/plugin-openrouter"
+          fi
+
+          package_dirs=(
+            plugins/plugin-sql
+            plugins/plugin-agent-skills
+            plugins/plugin-pdf
+            plugins/plugin-telegram
+            plugins/plugin-whatsapp
+            plugins/plugin-signal
+            plugins/plugin-imessage
+          )
+          if [ -n "$provider_package" ]; then
+            package_dirs+=("$provider_package")
+          fi
+
+          for package_dir in "${package_dirs[@]}"; do
+            echo "::group::Build ${package_dir}"
+            bun run --cwd "$package_dir" build
+            echo "::endgroup::"
+          done
+
       - name: Run EA + connector live scenarios
         id: run
         env:
@@ -162,6 +219,7 @@ jobs:
           # Run controls
           SCENARIO_FILTER: ${{ inputs.scenario_filter }}
           LIFEOPS_JUDGE_THRESHOLD: ${{ inputs.judge_threshold || '0.8' }}
+          SCENARIO_ENFORCE_GATE: ${{ inputs.enforce_gate && '1' || '0' }}
           SKIP_REASON: ${{ inputs.skip_reason }}
           REPORT_PATH: artifacts/lifeops-scenario-report.json
         run: node scripts/run-live-scenarios.mjs
diff --git a/scripts/run-live-scenarios.mjs b/scripts/run-live-scenarios.mjs
@@ -21,6 +21,8 @@
  *   - SCENARIO_ROOT: scenario directory, relative to repo root or absolute
  *     (default: apps/app-lifeops/test/scenarios).
  *   - SCENARIO_INCLUDE_PENDING=1: include scenarios marked status="pending".
+ *   - SCENARIO_ENFORCE_GATE (default 1): set to 0/false/no/off to keep the
+ *     workflow green (report still written) when the runner exits non-zero.
  *   - SKIP_REASON: required when any scenario is intentionally skipped.
  *   - REPORT_PATH: where to write the JSON report (default: artifacts/lifeops-scenario-report.json).
  *
@@ -107,14 +109,18 @@ if (filter.length > 0) {
 }
 
 const judgeThreshold = process.env.LIFEOPS_JUDGE_THRESHOLD ?? "0.8";
+const enforceGateValue = (process.env.SCENARIO_ENFORCE_GATE ?? "1")
+  .trim()
+  .toLowerCase();
+const enforceGate = !["0", "false", "no", "off"].includes(enforceGateValue);
 const env = {
   ...process.env,
   ELIZA_LIVE_TEST: "1",
   LIFEOPS_LIVE_JUDGE_MIN_SCORE: judgeThreshold,
 };
 
 console.log(
-  `[run-live-scenarios] threshold=${judgeThreshold} pending=${env.SCENARIO_INCLUDE_PENDING === "1" ? "included" : "excluded"} report=${reportPath} args=${args.slice(2).join(" ")}`,
+  `[run-live-scenarios] threshold=${judgeThreshold} enforce=${enforceGate ? "yes" : "no"} pending=${env.SCENARIO_INCLUDE_PENDING === "1" ? "included" : "excluded"} report=${reportPath} args=${args.slice(2).join(" ")}`,
 );
 
 const child = spawn(process.execPath, args, {
@@ -127,5 +133,12 @@ child.on("exit", (code, signal) => {
     console.error(`[run-live-scenarios] killed by signal ${signal}`);
     process.exit(1);
   }
-  process.exit(code ?? 1);
+  const exitCode = code ?? 1;
+  if (exitCode !== 0 && !enforceGate) {
+    console.warn(
+      `[run-live-scenarios] scenario gate exited ${exitCode}; SCENARIO_ENFORCE_GATE=0 so the report is non-blocking.`,
+    );
+    process.exit(0);
+  }
+  process.exit(exitCode);
 });
diff --git a/scripts/run-scenario-benchmark.mjs b/scripts/run-scenario-benchmark.mjs
@@ -15,6 +15,7 @@
  * Optional:
  *   LIFEOPS_JUDGE_THRESHOLD (default 0.8)
  *   SCENARIO_FILTER         (comma-separated ids)
+ *   BENCHMARK_ENFORCE_GATE  (default 1; set 0 for report-only runs)
  *   BENCHMARK_REPORT_PATH   (default: artifacts/benchmark-report.md)
  */
 
@@ -97,9 +98,13 @@ const runnerEnv = {
   SCENARIO_FILTER: scenariosToRun.join(","),
   REPORT_PATH: REPORT_JSON,
 };
+const enforceGateValue = (process.env.BENCHMARK_ENFORCE_GATE ?? "1")
+  .trim()
+  .toLowerCase();
+const enforceGate = !["0", "false", "no", "off"].includes(enforceGateValue);
 
 console.log(
-  `[benchmark] invoking scenario-runner for ${scenariosToRun.length} scenarios (threshold=${runnerEnv.LIFEOPS_JUDGE_THRESHOLD}, globs=${SCENARIO_FILE_GLOBS.join(",")})`,
+  `[benchmark] invoking scenario-runner for ${scenariosToRun.length} scenarios (threshold=${runnerEnv.LIFEOPS_JUDGE_THRESHOLD}, enforce=${enforceGate ? "yes" : "no"}, globs=${SCENARIO_FILE_GLOBS.join(",")})`,
 );
 
 const result = spawnSync(
@@ -135,6 +140,7 @@ function renderMarkdown() {
     `- Executed: ${report.totalCount ?? 0}`,
     `- Failed: ${report.failedCount ?? 0}`,
     `- Runner exit: ${runnerExitCode}`,
+    `- Enforcement: ${enforceGate ? "blocking" : "report-only"}`,
     "",
     "## Results",
     "",
@@ -180,4 +186,11 @@ writeFileSync(REPORT_MD, renderMarkdown(), "utf-8");
 console.log(`[benchmark] wrote markdown report to ${REPORT_MD}`);
 console.log(`[benchmark] wrote JSON report to ${REPORT_JSON}`);
 
+if (runnerExitCode !== 0 && !enforceGate) {
+  console.warn(
+    `[benchmark] scenario gate exited ${runnerExitCode}; BENCHMARK_ENFORCE_GATE=0 so the report is non-blocking.`,
+  );
+  process.exit(0);
+}
+
 process.exit(runnerExitCode);