-
Notifications
You must be signed in to change notification settings - Fork 1
136 lines (121 loc) · 3.85 KB
/
agent-eval.yml
File metadata and controls
136 lines (121 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Manually dispatched workflow that runs one agent-eval scenario suite.
name: Agent Eval

on:
  # Only a manual trigger is defined; every knob below appears in the
  # "Run workflow" form in the order written here.
  workflow_dispatch:
    inputs:
      # Which scenario suite to run. A fixed choice list, so the value is
      # always one of the names below.
      suite:
        description: 'Scenario suite to run (e.g. acmeauth, dub-ts, dub-go)'
        type: choice
        required: true
        options:
          - acmeauth
          - acmeauth-value-add
          - dub-go
          - dub-python
          - dub-ts
          - dub-ts-value-add
          - mistral-python
          - mistral-ts
          - pushpress-ts

      # Agent backend; `auto` is the default selection.
      provider:
        description: 'Agent provider'
        type: choice
        required: false
        default: auto
        options:
          - auto
          - anthropic
          - openai

      # Free-form text: when left empty the job omits the --model flag.
      model:
        description: 'Model override (leave empty for provider default)'
        type: string
        required: false

      # Free-form text: when left empty the job omits the --include flag.
      include:
        description: 'Comma-separated scenario IDs to run (leave empty for all)'
        type: string
        required: false

      # Forwarded to the eval CLI as --max-concurrency.
      max-concurrency:
        description: 'Max parallel scenarios'
        type: number
        required: false
        default: 3

      # When true the job appends --compare.
      compare:
        description: 'Run A/B comparison (with vs without docs-mcp)'
        type: boolean
        required: false
        default: false

      # When true the job appends --debug.
      debug:
        description: 'Enable verbose agent event logging'
        type: boolean
        required: false
        default: false
jobs:
  # Single job: build the workspace, run the selected eval suite with the
  # dispatch inputs, then publish the result files as an artifact.
  #
  # NOTE(review): no `permissions:` block is declared in this workflow, so the
  # job runs with the repository's default token scope. Consider restricting
  # it (e.g. `contents: read`) after confirming that no step needs more.
  agent-eval:
    runs-on: blacksmith-4vcpu-ubuntu-2404
    timeout-minutes: 60
    steps:
      - name: Checkout
        uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v5.0.0

      - name: Setup Mise
        uses: jdx/mise-action@6d1e696aa24c1aa1bcc1adea0212707c71ab78a8 # v3.6.1
        with:
          install: true
          cache: true
          env: false

      # Presumably exports GH_CACHE_PNPM_KEY, GH_CACHE_PNPM_KEY_PARTIAL and
      # PNPM_STORE_PATH, which the next step reads from `env` — verify against
      # the `github` mise task.
      - name: Prepare GitHub Actions environment
        run: mise run github

      - name: Cache PNPM
        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          key: ${{ env.GH_CACHE_PNPM_KEY }}
          restore-keys: |
            ${{ env.GH_CACHE_PNPM_KEY }}
            ${{ env.GH_CACHE_PNPM_KEY_PARTIAL }}
          path: |
            ${{ env.PNPM_STORE_PATH }}

      # Per-suite cache of eval indexes and cloned repos, falling back to any
      # other suite's cache via the bare `agent-eval-cache-` prefix.
      # NOTE(review): the key never changes for a given suite; confirm that a
      # cache which is never refreshed after its first save is intended.
      - name: Cache eval indexes and repos
        uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
        with:
          key: agent-eval-cache-${{ inputs.suite }}
          restore-keys: |
            agent-eval-cache-${{ inputs.suite }}
            agent-eval-cache-
          path: |
            .cache/indexes
            .cache/repos

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      - name: Build
        run: pnpm build

      - name: Run agent eval
        # Bash is required for the argument array used below.
        shell: bash
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          NO_COLOR: "1"
          # Dispatch inputs are handed to the script as environment variables
          # instead of being expanded into the script text. `model` and
          # `include` are free-form strings; inlining `${{ inputs.* }}` in
          # `run:` would let shell metacharacters in them execute in a step
          # that holds the API keys above.
          INPUT_SUITE: ${{ inputs.suite }}
          INPUT_PROVIDER: ${{ inputs.provider }}
          INPUT_MODEL: ${{ inputs.model }}
          INPUT_INCLUDE: ${{ inputs.include }}
          INPUT_MAX_CONCURRENCY: ${{ inputs.max-concurrency }}
          INPUT_COMPARE: ${{ inputs.compare }}
          INPUT_DEBUG: ${{ inputs.debug }}
        run: |
          # Build the CLI arguments as an array so each value stays a single
          # argument (no word splitting or glob expansion).
          args=(
            --suite "$INPUT_SUITE"
            --provider "$INPUT_PROVIDER"
            --max-concurrency "$INPUT_MAX_CONCURRENCY"
            --out results.json
          )
          # Optional flags are only added when the matching input was set.
          if [ -n "$INPUT_MODEL" ]; then
            args+=(--model "$INPUT_MODEL")
          fi
          if [ -n "$INPUT_INCLUDE" ]; then
            args+=(--include "$INPUT_INCLUDE")
          fi
          if [ "$INPUT_COMPARE" = "true" ]; then
            args+=(--compare)
          fi
          if [ "$INPUT_DEBUG" = "true" ]; then
            args+=(--debug)
          fi
          echo "Running: node packages/eval/dist/bin.js agent-eval ${args[*]}"
          node packages/eval/dist/bin.js agent-eval "${args[@]}"

      # `always()` keeps the upload running even when the eval step fails, so
      # partial results are still published.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: agent-eval-${{ inputs.suite }}-${{ github.run_number }}
          path: |
            results.json
            .eval-results/${{ inputs.suite }}/
          retention-days: 90