# Eval (Spec 065 regression gate) #33
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
name: Eval (Spec 065 regression gate)

# Spec 065 / C1 (FR-009, US3/P2) — regression gate over the frozen Spec-065
# datasets. The gate is split into two independent jobs so that a network
# flake in D1 (retrieval) can never mask or block the deterministic D2
# (security) gate.
#
#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers, so it is
#                  network dependent. Report-only on PRs, blocking on the
#                  nightly schedule (promote to PR-blocking after a green
#                  soak — see the plan on MCP-742). Reports are uploaded as
#                  artifacts, never committed (CN-003).
#
# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out
# at a pinned ref — no token needed.

on:
  pull_request:
    paths:
      # System under test for D2 (security).
      - 'internal/security/**'
      # System under test for D1 (retrieval): the BM25 index, MCP
      # tool-discovery routing, the REST search envelope, and the server/CLI
      # boot behavior the retrieval eval depends on. Spec 065 requires CI to
      # catch discovery regressions whenever search/index/tool-discovery
      # behavior changes.
      - 'internal/index/**'
      - 'internal/server/**'
      - 'internal/httpapi/**'
      - 'cmd/mcpproxy/**'
      # The eval harness, the frozen datasets, and this workflow itself.
      - 'cmd/scan-eval/**'
      - 'specs/065-evaluation-foundation/datasets/**'
      - 'scripts/eval-ci-smoke.sh'
      - '.github/workflows/eval.yml'
  workflow_dispatch: {}
  schedule:
    # Nightly soak at 02:30 UTC — exercises D1 against live upstreams.
    - cron: '30 2 * * *'

permissions:
  contents: read

env:
  # Pinned for reproducibility: the SecurityScorer (B3) merge commit.
  MCP_EVAL_REF: '76df3a47e1480bfde2433b4f19df19312c985963'
  PYTHON_VERSION: '3.11.13'
jobs:
  # D2 — deterministic security gate. Builds nothing live: it checks out this
  # repo plus the pinned mcp-eval harness, syncs the Python environment and
  # runs the smoke script. Blocking on every trigger.
  security-d2:
    name: Security regression gate (D2)
    runs-on: ubuntu-latest
    # Cap the job well below the 360-minute default so a wedged step fails
    # fast instead of holding a runner for hours.
    timeout-minutes: 30
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          # No later step fetches or pushes, and third-party code runs in this
          # job (uv sync), so do not leave the token available to it.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          # Public repo, read once at a pinned ref — no credentials needed.
          persist-credentials: false

      - name: Set up uv + Python
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b  # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      - name: Run D2 security gate
        env:
          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval
          OUT_DIR: reports/security
        run: bash scripts/eval-ci-smoke.sh

      # Always upload, so a failing gate still leaves its reports behind.
      - name: Upload D2 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: eval-security-d2
          path: reports/security/
          retention-days: 14
          if-no-files-found: ignore

      # `git ls-files` lists tracked files only, so freshly generated reports
      # in the working tree do not trip this check — committed ones do.
      - name: Assert reports are not committed (CN-003)
        if: always()
        run: |
          tracked="$(git ls-files reports/ || true)"
          if [ -n "$tracked" ]; then
            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
            echo "$tracked"
            exit 1
          fi
          echo "OK: no eval reports are tracked by git."

  # D1 — retrieval gate. Builds mcpproxy, boots it against the snapshot
  # server config, waits for the tool catalog to fill, then scores retrieval
  # against the frozen golden set and baseline.
  retrieval-d1:
    name: Retrieval regression gate (D1)
    runs-on: ubuntu-latest
    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
    # source); blocking on the nightly schedule. Promote to PR-blocking after a
    # green soak (plan on MCP-742).
    continue-on-error: ${{ github.event_name == 'pull_request' }}
    # This job talks to live upstreams; bound it so a hang cannot occupy a
    # runner for the 360-minute default. The readiness poll alone is capped at
    # roughly 10 minutes (60 attempts x (5s probe timeout + 5s sleep)).
    timeout-minutes: 45
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          # Third-party code runs in this job (uv sync, npx/uvx-launched
          # servers) and nothing needs git credentials afterwards.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Set up Node.js (npx-launched servers)
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e  # v6.4.0
        with:
          node-version: "22"

      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          # Public repo, read once at a pinned ref — no credentials needed.
          persist-credentials: false

      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b  # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      - name: Build mcpproxy
        run: go build -o mcpproxy ./cmd/mcpproxy

      # Boot, poll and score in ONE step: a server backgrounded in a separate
      # step is reaped when that step's shell exits, so start + wait + run must
      # share a shell. The trap stops the server however this step ends.
      - name: Run D1 retrieval gate (boot mcpproxy + score)
        working-directory: ${{ github.workspace }}
        env:
          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
        run: |
          set -uo pipefail
          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
          # data_dir must exist — `serve` refuses to create a missing one.
          mkdir -p "$RUNNER_TEMP/eval"
          ./mcpproxy serve \
            --config "$DS/snapshot-servers.config.json" \
            --data-dir "$RUNNER_TEMP/eval" \
            --listen 127.0.0.1:8092 \
            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
          server_pid=$!
          trap 'kill "$server_pid" 2>/dev/null || true' EXIT
          # Wait for the FULL tool catalog before scoring: the retrieval index is
          # built from the connected servers' tools, and scoring a partially
          # indexed instance tanks recall (a ≥1-result check fires far too early).
          # The 7 reference servers expose ~45 tools; require near-full + a short
          # settle for the index build. /api/v1/tools wraps as
          # {"success":true,"data":{"tools":[...]}}.
          ready=0
          expected=44
          for i in $(seq 1 60); do
            # Stop polling as soon as the server is gone; the failure branch
            # below then prints the log tail.
            if ! kill -0 "$server_pid" 2>/dev/null; then
              echo "::error::mcpproxy process exited during startup"
              break
            fi
            # --max-time bounds every probe: without it a connection that is
            # accepted but never answered would block here forever and the
            # 60-attempt limit would never be reached. Any failure in the
            # pipeline (curl error, non-JSON body) counts as 0 tools.
            t="$(curl -fsS --max-time 5 -H "X-API-Key: $key" "$base/api/v1/tools" \
              | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)"
            echo "attempt $i: catalog has $t tool(s)"
            if [ "$t" -ge "$expected" ]; then
              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
            fi
            sleep 5
          done
          if [ "$ready" != 1 ]; then
            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
            echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
            exit 1
          fi
          # Score in a subshell so the cd into mcp-eval does not leak; the
          # subshell's exit status is the step's result.
          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
              --corpus "$DS/corpus_v1.tools.json" \
              --golden "$DS/retrieval_golden_v1.json" \
              --baseline "$DS/baseline_v1.json" \
              --tolerance 0.05 \
              --runs 1 \
              --base-url "$base" \
              --api-key "$key" \
              --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )

      # Always upload, so a failing gate still leaves its reports behind.
      - name: Upload D1 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
        with:
          name: eval-retrieval-d1
          path: reports/retrieval/
          retention-days: 14
          if-no-files-found: ignore