-
Notifications
You must be signed in to change notification settings - Fork 31
208 lines (185 loc) · 8.02 KB
/
eval.yml
File metadata and controls
208 lines (185 loc) · 8.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
name: Eval (Spec 065 regression gate)

# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065
# datasets. Two independent jobs so a network flake in D1 (retrieval) never
# masks or blocks the deterministic D2 (security) gate.
#
#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers; network
#                  dependent. Report-only on PRs, blocking on the nightly
#                  schedule (promote to PR-blocking after a green soak — see
#                  the plan on MCP-742). Reports are uploaded as artifacts,
#                  never committed (CN-003).
#
# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at
# a pinned ref — no token needed.

on:
  # PRs trigger the gate only when a system under test, the harness, the
  # frozen datasets, or this workflow itself changes.
  pull_request:
    paths:
      # D2 (security) system under test
      - "internal/security/**"
      # D1 (retrieval) system under test — BM25 index, MCP tool-discovery
      # routing, the REST search envelope, and server/CLI boot behavior the
      # retrieval eval depends on. Spec 065 requires CI to catch discovery
      # regressions when search/index/tool-discovery behavior changes.
      - "internal/index/**"
      - "internal/server/**"
      - "internal/httpapi/**"
      - "cmd/mcpproxy/**"
      # Eval harness + frozen datasets + this workflow
      - "cmd/scan-eval/**"
      - "specs/065-evaluation-foundation/datasets/**"
      - "scripts/eval-ci-smoke.sh"
      - ".github/workflows/eval.yml"
  workflow_dispatch: {}
  schedule:
    # Nightly soak (02:30 UTC) — exercises D1 against live upstreams.
    - cron: "30 2 * * *"

# Least privilege: the jobs only read repository contents.
permissions:
  contents: read

# Shared by both jobs so they evaluate against the same harness and runtime.
env:
  MCP_EVAL_REF: "76df3a47e1480bfde2433b4f19df19312c985963"  # SecurityScorer (B3) merge — pin for reproducibility
  PYTHON_VERSION: "3.11.13"
jobs:
  # D2 — deterministic security gate. No live upstreams; blocking on every
  # trigger.
  security-d2:
    name: Security regression gate (D2)
    runs-on: ubuntu-latest
    # Generous ceiling so a hung run cannot hold a runner for the 360-minute
    # default. NOTE(review): tune to observed run times.
    timeout-minutes: 30
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          # Later steps run code from another repo and its dependencies; none
          # of them needs git auth, so do not leave the token behind.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          persist-credentials: false

      - name: Set up uv + Python
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b  # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      # The smoke script reads MCP_EVAL_DIR / OUT_DIR from the environment.
      - name: Run D2 security gate
        env:
          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval
          OUT_DIR: reports/security
        run: bash scripts/eval-ci-smoke.sh

      # always(): keep the reports even when the gate itself failed.
      - name: Upload D2 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: eval-security-d2
          path: reports/security/
          retention-days: 14
          if-no-files-found: ignore

      # `git ls-files` lists only tracked paths, so the reports this job just
      # generated do not trip the check — only reports committed to the repo do.
      - name: Assert reports are not committed (CN-003)
        if: always()
        run: |
          tracked="$(git ls-files reports/ || true)"
          if [ -n "$tracked" ]; then
            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
            echo "$tracked"
            exit 1
          fi
          echo "OK: no eval reports are tracked by git."

  # D1 — retrieval gate against a live mcpproxy; network dependent.
  retrieval-d1:
    name: Retrieval regression gate (D1)
    runs-on: ubuntu-latest
    # Generous ceiling so a hung run cannot hold a runner for the 360-minute
    # default. NOTE(review): tune to observed run times.
    timeout-minutes: 45
    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
    # source); blocking on the nightly schedule. Promote to PR-blocking after a
    # green soak (plan on MCP-742).
    continue-on-error: ${{ github.event_name == 'pull_request' }}
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          # Later steps run npx/uvx-fetched packages and mcp-eval; none of them
          # needs git auth, so do not leave the token behind.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Set up Node.js (npx-launched servers)
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
        with:
          node-version: "22"

      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          persist-credentials: false

      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b  # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      - name: Build mcpproxy
        run: go build -o mcpproxy ./cmd/mcpproxy

      # Boot, poll and score in ONE step: a server backgrounded in a separate
      # step is reaped when that step's shell exits, so start + wait + run must
      # share a shell. The trap stops the server however this step ends.
      - name: Run D1 retrieval gate (boot mcpproxy + score)
        working-directory: ${{ github.workspace }}
        env:
          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
        run: |
          set -uo pipefail
          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"

          # data_dir must exist — `serve` refuses to create a missing one.
          mkdir -p "$RUNNER_TEMP/eval"
          ./mcpproxy serve \
            --config "$DS/snapshot-servers.config.json" \
            --data-dir "$RUNNER_TEMP/eval" \
            --listen 127.0.0.1:8092 \
            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
          server_pid=$!
          trap 'kill "$server_pid" 2>/dev/null || true' EXIT

          # Wait for the FULL tool catalog before scoring: the retrieval index is
          # built from the connected servers' tools, and scoring a partially
          # indexed instance tanks recall (a ≥1-result check fires far too early).
          # The 7 reference servers expose ~45 tools; require near-full + a short
          # settle for the index build. /api/v1/tools wraps as
          # {"success":true,"data":{"tools":[...]}}.
          ready=0
          expected=44
          for i in $(seq 1 60); do
            # Stop polling as soon as the server is gone; the readiness check
            # below then fails the step and prints the log tail.
            if ! kill -0 "$server_pid" 2>/dev/null; then
              echo "::error::mcpproxy process exited during startup"
              break
            fi
            # --max-time bounds each attempt so a wedged server cannot stall the
            # loop. Any fetch/parse failure counts as 0 tools and is retried.
            t="$(curl -fsS --max-time 10 -H "X-API-Key: $key" "$base/api/v1/tools" \
              | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)"
            echo "attempt $i: catalog has $t tool(s)"
            if [ "$t" -ge "$expected" ]; then
              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
            fi
            sleep 5
          done

          if [ "$ready" != 1 ]; then
            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
            echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
            exit 1
          fi

          # Score inside a subshell so the `cd` does not leak; its exit status
          # is the step's result.
          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
              --corpus "$DS/corpus_v1.tools.json" \
              --golden "$DS/retrieval_golden_v1.json" \
              --baseline "$DS/baseline_v1.json" \
              --tolerance 0.05 \
              --runs 1 \
              --base-url "$base" \
              --api-key "$key" \
              --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )

      # always(): keep the reports even when the gate itself failed.
      - name: Upload D1 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: eval-retrieval-d1
          path: reports/retrieval/
          retention-days: 14
          if-no-files-found: ignore