|
1 | 1 | # Live Scenario Runner (nightly) |
2 | 2 | # |
3 | 3 | # Executes the 22 executive-assistant scenarios and 15 connector certification |
4 | | -# scenarios against a live LLM runtime with real connector credentials. Fails |
5 | | -# loudly when scenarios skip without SKIP_REASON, when any scenario fails, or |
6 | | -# when the aggregate LLM-judge score falls below LIFEOPS_JUDGE_THRESHOLD. |
| 4 | +# scenarios against a live LLM runtime with real connector credentials and |
| 5 | +# uploads the JSON report. Scheduled/default dispatch runs are report-only |
| 6 | +# while this catalog is still being hardened; manual dispatch can opt into |
| 7 | +# failing when any scenario fails or the aggregate LLM-judge score falls below |
| 8 | +# LIFEOPS_JUDGE_THRESHOLD. Missing setup prerequisites still fail loudly. |
7 | 9 | # |
8 | 10 | # Required repo secrets (self-documented): |
9 | 11 | # LLM provider (at least one): |
|
28 | 30 | # Optional: |
29 | 31 | # LIFEOPS_JUDGE_THRESHOLD (workflow input, default 0.8) |
30 | 32 | # SCENARIO_FILTER (comma-separated scenario ids, default all) |
| 33 | +# SCENARIO_ENFORCE_GATE (workflow input enforce_gate, default false; exported as '1' or '0') |
31 | 34 | # SKIP_REASON (required if SCENARIO_SKIP is set) |
32 | 35 | # |
33 | 36 | # 1Password vault: this workflow's plain `*_API_KEY` secrets are sourced from the |
|
59 | 62 | required: false |
60 | 63 | type: string |
61 | 64 | default: "" |
| 65 | + enforce_gate: |
| 66 | + description: "Fail the workflow when live scenarios fail" |
| 67 | + required: false |
| 68 | + type: boolean |
| 69 | + default: false |
62 | 70 |
|
63 | 71 | concurrency: |
64 | 72 | group: live-scenarios-${{ github.ref }} |
@@ -111,6 +119,55 @@ jobs: |
111 | 119 | cd packages/schemas && bunx buf generate |
112 | 120 | fi |
113 | 121 |
|
| 122 | + - name: Build live scenario runtime packages |
| 123 | + # The live runner executes TypeScript sources directly, but several |
| 124 | + # workspace packages intentionally export dist/* entry points. Dependency |
| 125 | + # installation ignores postinstall scripts, so build those outputs here, |
| 126 | + # limited to the packages the live scenario runtime imports through them. |
| 127 | + env: |
| 128 | + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} |
| 129 | + OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }} |
| 130 | + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} |
| 131 | + GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GOOGLE_GENERATIVE_AI_API_KEY }} |
| 132 | + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} |
| 133 | + GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }} |
| 134 | + run: | |
| 135 | + echo "::group::Build packages/core" |
| 136 | + bun run --cwd packages/core build |
| 137 | + echo "::endgroup::" |
| 138 | +
|
| 139 | + provider_package="" |
| 140 | + if [ -n "${GROQ_API_KEY:-}" ]; then |
| 141 | + provider_package="plugins/plugin-groq" |
| 142 | + elif [ -n "${OPENAI_API_KEY:-}" ]; then |
| 143 | + provider_package="plugins/plugin-openai" |
| 144 | + elif [ -n "${ANTHROPIC_API_KEY:-}" ]; then |
| 145 | + provider_package="plugins/plugin-anthropic" |
| 146 | + elif [ -n "${GOOGLE_GENERATIVE_AI_API_KEY:-}" ] || [ -n "${GOOGLE_API_KEY:-}" ]; then |
| 147 | + provider_package="plugins/plugin-google-genai" |
| 148 | + elif [ -n "${OPENROUTER_API_KEY:-}" ]; then |
| 149 | + provider_package="plugins/plugin-openrouter" |
| 150 | + fi |
| 151 | +
|
| 152 | + package_dirs=( |
| 153 | + plugins/plugin-sql |
| 154 | + plugins/plugin-agent-skills |
| 155 | + plugins/plugin-pdf |
| 156 | + plugins/plugin-telegram |
| 157 | + plugins/plugin-whatsapp |
| 158 | + plugins/plugin-signal |
| 159 | + plugins/plugin-imessage |
| 160 | + ) |
| 161 | + if [ -n "$provider_package" ]; then |
| 162 | + package_dirs+=("$provider_package") |
| 163 | + fi |
| 164 | +
|
| 165 | + for package_dir in "${package_dirs[@]}"; do |
| 166 | + echo "::group::Build ${package_dir}" |
| 167 | + bun run --cwd "$package_dir" build |
| 168 | + echo "::endgroup::" |
| 169 | + done |
| 170 | +
|
114 | 171 | - name: Run EA + connector live scenarios |
115 | 172 | id: run |
116 | 173 | env: |
@@ -162,6 +219,7 @@ jobs: |
162 | 219 | # Run controls |
163 | 220 | SCENARIO_FILTER: ${{ inputs.scenario_filter }} |
164 | 221 | LIFEOPS_JUDGE_THRESHOLD: ${{ inputs.judge_threshold || '0.8' }} |
| 222 | + SCENARIO_ENFORCE_GATE: ${{ inputs.enforce_gate && '1' || '0' }} |
165 | 223 | SKIP_REASON: ${{ inputs.skip_reason }} |
166 | 224 | REPORT_PATH: artifacts/lifeops-scenario-report.json |
167 | 225 | run: node scripts/run-live-scenarios.mjs |
|
0 commit comments