mcpproxy-go/.github/workflows/eval.yml at 41fe212e3634977ac9511108d7849c99a47c44ba · smart-mcp-proxy/mcpproxy-go · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
# Workflow name.
name: Eval (Spec 065 regression gate)

# Spec 065 / C1 (FR-009, US3/P2): regression gate over the frozen Spec-065
# datasets. Two independent jobs so a network flake in D1 (retrieval) never
# masks or blocks the deterministic D2 (security) gate.
#
#   security-d2  — Go + Python only, no live upstreams. HARD gate (blocking).
#   retrieval-d1 — needs a live mcpproxy serving 7 reference servers; network
#                  dependent. Report-only on PRs, blocking on the nightly
#                  schedule (promote to PR-blocking after a green soak — see the
#                  plan on MCP-742). Reports are uploaded as artifacts, never
#                  committed (CN-003).
#
# mcp-eval (smart-mcp-proxy/mcp-eval) is a separate PUBLIC repo, checked out at a
# pinned ref — no token needed.

on:
  # Nightly soak at 02:30 UTC; this is the run that exercises D1 against the
  # live upstream servers.
  schedule:
    - cron: '30 2 * * *'
  # Manual trigger, no inputs.
  workflow_dispatch: {}
  # PR runs are limited to changes that can move either eval result.
  pull_request:
    paths:
      # System under test for D2 (security).
      - 'internal/security/**'
      # System under test for D1 (retrieval): the BM25 index, MCP
      # tool-discovery routing, the REST search envelope, and the server/CLI
      # boot behavior the retrieval eval relies on. Spec 065 requires CI to
      # catch discovery regressions whenever search/index/tool-discovery
      # behavior changes.
      - 'internal/index/**'
      - 'internal/server/**'
      - 'internal/httpapi/**'
      - 'cmd/mcpproxy/**'
      # The eval harness, the frozen datasets, and this workflow file itself.
      - 'cmd/scan-eval/**'
      - 'specs/065-evaluation-foundation/datasets/**'
      - 'scripts/eval-ci-smoke.sh'
      - '.github/workflows/eval.yml'

# The workflow token is limited to read-only access to repository contents.
permissions:
  contents: read

# Values shared by both jobs.
env:
  # mcp-eval commit to check out: the SecurityScorer (B3) merge, pinned for
  # reproducibility.
  MCP_EVAL_REF: '76df3a47e1480bfde2433b4f19df19312c985963'
  # Python version handed to setup-uv.
  PYTHON_VERSION: '3.11.13'

jobs:
  # D2 (security) regression gate. No step here starts a server or declares a
  # live upstream, and the job has no continue-on-error, so any failing step
  # fails the workflow run.
  security-d2:
    name: Security regression gate (D2)
    runs-on: ubuntu-latest
    # Fail fast instead of holding a runner for the 360-minute default when a
    # step hangs. Generous ceiling — tune once typical run times are known.
    timeout-minutes: 30
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          # No later step runs an authenticated git command, so do not keep the
          # token configured after the checkout.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      # The eval harness lives in a separate repository; it is checked out
      # into ./mcp-eval at the commit pinned in env.MCP_EVAL_REF.
      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          persist-credentials: false

      - name: Set up uv + Python
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      # The gate itself is scripts/eval-ci-smoke.sh. MCP_EVAL_DIR and OUT_DIR
      # are presumably read by that script (harness location and report
      # directory) — confirm against the script.
      - name: Run D2 security gate
        env:
          MCP_EVAL_DIR: ${{ github.workspace }}/mcp-eval
          OUT_DIR: reports/security
        run: bash scripts/eval-ci-smoke.sh

      # Runs even after a failed gate so the reports are available for triage;
      # an absent report directory is not treated as an error.
      - name: Upload D2 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: eval-security-d2
          path: reports/security/
          retention-days: 14
          if-no-files-found: ignore

      # `git ls-files` lists tracked files only, so reports generated during
      # this run do not trip the check — only reports committed to the repo do.
      - name: Assert reports are not committed (CN-003)
        if: always()
        run: |
          tracked="$(git ls-files reports/ || true)"
          if [ -n "$tracked" ]; then
            echo "::error::Eval reports must never be committed (CN-003). Tracked under reports/:"
            echo "$tracked"
            exit 1
          fi
          echo "OK: no eval reports are tracked by git."

  # D1 (retrieval) regression gate: builds mcpproxy, boots it against the
  # snapshot server config, waits for the tool catalog to fill, then scores
  # retrieval against the frozen golden set and baseline.
  retrieval-d1:
    name: Retrieval regression gate (D1)
    runs-on: ubuntu-latest
    # Fail fast instead of holding a runner for the 360-minute default. The
    # readiness poll below is bounded at roughly 15 minutes worst case
    # (60 attempts x (10s curl cap + 5s sleep)); the rest is setup and scoring.
    timeout-minutes: 30
    # Report-only on PRs (D1 depends on npx/uvx package fetches — a known flake
    # source); blocking on every other trigger, including the nightly schedule.
    # Promote to PR-blocking after a green soak (plan on MCP-742).
    continue-on-error: ${{ github.event_name == 'pull_request' }}
    steps:
      - name: Checkout mcpproxy-go
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          # No later step runs an authenticated git command, so do not keep the
          # token configured after the checkout.
          persist-credentials: false

      - name: Set up Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5.6.0
        with:
          go-version: "1.25"
          cache: true

      - name: Set up Node.js (npx-launched servers)
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: "22"

      # The eval harness lives in a separate repository; it is checked out
      # into ./mcp-eval at the commit pinned in env.MCP_EVAL_REF.
      - name: Checkout mcp-eval (public, pinned)
        uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
        with:
          repository: smart-mcp-proxy/mcp-eval
          ref: ${{ env.MCP_EVAL_REF }}
          path: mcp-eval
          persist-credentials: false

      - name: Set up uv + Python (uvx-launched servers + mcp-eval)
        uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Sync mcp-eval environment
        working-directory: mcp-eval
        run: uv sync

      - name: Build mcpproxy
        run: go build -o mcpproxy ./cmd/mcpproxy

      # Boot, poll and score in ONE step: a server backgrounded in a separate
      # step is reaped when that step's shell exits, so start + wait + run must
      # share a shell. The trap stops the server however this step ends.
      - name: Run D1 retrieval gate (boot mcpproxy + score)
        working-directory: ${{ github.workspace }}
        env:
          DS: ${{ github.workspace }}/specs/065-evaluation-foundation/datasets
        run: |
          set -uo pipefail
          base="http://127.0.0.1:8092"; key="eval-corpus-snapshot"
          # data_dir must exist — `serve` refuses to create a missing one.
          mkdir -p "$RUNNER_TEMP/eval"
          ./mcpproxy serve \
            --config "$DS/snapshot-servers.config.json" \
            --data-dir "$RUNNER_TEMP/eval" \
            --listen 127.0.0.1:8092 \
            --log-level info > "$RUNNER_TEMP/mcpproxy.log" 2>&1 &
          server_pid=$!
          trap 'kill "$server_pid" 2>/dev/null || true' EXIT

          # Wait for the FULL tool catalog before scoring: the retrieval index is
          # built from the connected servers' tools, and scoring a partially
          # indexed instance tanks recall (a ≥1-result check fires far too early).
          # The 7 reference servers expose ~45 tools; require near-full + a short
          # settle for the index build. /api/v1/tools wraps as
          # {"success":true,"data":{"tools":[...]}}.
          ready=0
          expected=44
          for i in $(seq 1 60); do
            # Stop polling as soon as the server process is gone.
            if ! kill -0 "$server_pid" 2>/dev/null; then
              echo "::error::mcpproxy process exited during startup"
              break
            fi
            # --max-time caps each probe so a server that accepts the connection
            # but never answers cannot stall the loop. Any curl or parse failure
            # is counted as 0 tools and retried on the next attempt.
            t="$(curl -fsS --max-time 10 -H "X-API-Key: $key" "$base/api/v1/tools" \
                 | python3 -c 'import sys,json;d=json.load(sys.stdin);print(len((d.get("data") or {}).get("tools", [])))' 2>/dev/null || echo 0)"
            echo "attempt $i: catalog has $t tool(s)"
            if [ "$t" -ge "$expected" ]; then
              ready=1; echo "Catalog full ($t tools); settling 8s for index build."; sleep 8; break
            fi
            sleep 5
          done
          if [ "$ready" != 1 ]; then
            echo "::error::mcpproxy catalog did not reach ${expected} tools in time"
            echo "----- mcpproxy.log (tail) -----"; tail -80 "$RUNNER_TEMP/mcpproxy.log" || true
            exit 1
          fi

          # Last command of the step, so its exit status is the step's result.
          ( cd "$GITHUB_WORKSPACE/mcp-eval" && PYTHONPATH=src uv run python -m mcp_eval.cli retrieval \
            --corpus "$DS/corpus_v1.tools.json" \
            --golden "$DS/retrieval_golden_v1.json" \
            --baseline "$DS/baseline_v1.json" \
            --tolerance 0.05 \
            --runs 1 \
            --base-url "$base" \
            --api-key "$key" \
            --out-dir "$GITHUB_WORKSPACE/reports/retrieval" )

      # Runs even after a failed gate so the reports are available for triage;
      # an absent report directory is not treated as an error.
      - name: Upload D1 reports
        if: always()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        with:
          name: eval-retrieval-d1
          path: reports/retrieval/
          retention-days: 14
          if-no-files-found: ignore