# Source: MCPJam/inspector — .github/workflows/deploy-mcp-prod.yml (main branch)
name: Deploy MCP Production

# Manual promotion of the MCP Cloudflare Worker from staging
# (mcp-staging.mcpjam.com) to production (mcp.mcpjam.com). Intentionally
# workflow_dispatch-only — there's no auto-deploy-on-merge for prod,
# matching release.yml's philosophy that production is a deliberate last
# step, not a side effect of merging.
#
# Invocation paths:
#   1. Soundcheck's "Deploy MCP production" tile (POST /api/mcp/dispatch).
#   2. GitHub Actions UI → "Deploy MCP Production" → Run workflow.
#
# Reviewer gating lives on the `mcp-production` GitHub Environment in repo
# Settings → Environments, not in this file. Keeping the gate in the
# environment config means it applies uniformly to both invocation paths.

on:
  workflow_dispatch:

# Token scopes granted to the workflow's GITHUB_TOKEN.
permissions:
  # Read access to repository contents: the job checks out the repo and
  # the staging gate compares two commits.
  contents: read
  # Read access to Actions data: the staging gate lists runs of
  # deploy-mcp-staging.yml.
  actions: read
  # The GITHUB_TOKEN is passed to cloudflare/wrangler-action as
  # `gitHubToken`; presumably the action records a GitHub deployment with
  # it — confirm against the action's documentation.
  deployments: write

# Never cancel an in-flight production deploy just because a second
# dispatch arrived. Mid-deploy cancellation can leave the worker in a
# partially-provisioned state (cert attached, bindings not). Queue instead.
concurrency:
  group: mcp-production
  cancel-in-progress: false

env:
  # Node.js version handed to actions/setup-node in the deploy job. Quoted
  # so YAML keeps it a string.
  NODE_VERSION: "24.14.0"
  # Resolved once here so the environment URL, smoke test, and job
  # summary all point at the same target. Falls back to the known
  # production hostname when the override var is unset — hardcoded in
  # the fallback for the same reason deploy-mcp-staging.yml hardcodes
  # its URL: cloudflare/wrangler-action@v3 returns a malformed
  # `deployment-url` output for custom-domain deploys (wrangler-action#396).
  PRODUCTION_URL: ${{ vars.MCP_WORKER_PRODUCTION_URL || 'https://mcp.mcpjam.com' }}

jobs:
  # The only job in this workflow: gate checks, build, deploy, smoke test,
  # and job summary all run as sequential steps on one runner.
  deploy:
    runs-on: ubuntu-latest
    # Upper bound on the whole job's runtime.
    timeout-minutes: 30
    # Binds the job to the `mcp-production` GitHub Environment. The
    # environment URL is the workflow-level PRODUCTION_URL, so it matches
    # the target used by the smoke test and the summary.
    environment:
      name: mcp-production
      url: ${{ env.PRODUCTION_URL }}

    steps:
      # Production may only be promoted from the `main` branch. The check
      # compares the fully-qualified GITHUB_REF instead of GITHUB_REF_NAME:
      # the short name is also "main" for a tag called `main`
      # (refs/tags/main), and workflow_dispatch can be started from a tag,
      # so the short-name comparison would let such a dispatch through.
      - name: Enforce main branch
        run: |
          if [ "${GITHUB_REF}" != "refs/heads/main" ]; then
            echo "deploy-mcp-prod.yml must run from main" >&2
            echo "Dispatched ref was: ${GITHUB_REF}" >&2
            exit 1
          fi

      # Abort early, with a message naming the missing secret, when either
      # Cloudflare credential resolves to an empty string. The account ID
      # is checked first; the first empty value ends the job.
      - name: Require Cloudflare credentials
        env:
          CLOUDFLARE_ACCOUNT_ID: "${{ secrets.CLOUDFLARE_ACCOUNT_ID }}"
          CLOUDFLARE_API_TOKEN: "${{ secrets.CLOUDFLARE_API_TOKEN }}"
        run: |
          [ -n "$CLOUDFLARE_ACCOUNT_ID" ] || {
            echo "Missing GitHub Actions secret: CLOUDFLARE_ACCOUNT_ID" >&2
            exit 1
          }
          [ -n "$CLOUDFLARE_API_TOKEN" ] || {
            echo "Missing GitHub Actions secret: CLOUDFLARE_API_TOKEN" >&2
            exit 1
          }

      # Check out the repository so the later install, typecheck, and
      # wrangler steps have a working tree. No `ref` input is given, so the
      # action's default applies — presumably the commit this run was
      # dispatched for; confirm against the actions/checkout documentation.
      - name: Checkout code
        uses: actions/checkout@v4

      # Refuse to promote a SHA whose MCP build inputs haven't been
      # verified on staging. The check is looser than an exact-SHA match
      # because `deploy-mcp-staging.yml` only fires on mcp-relevant path
      # changes — so most main commits legitimately never trigger a
      # staging run for their SHA, and an exact-SHA gate would lock
      # promotion out whenever main moved for unrelated reasons.
      #
      # Logic: find the latest successful staging run whose head branch is
      # main. If its SHA matches the dispatched SHA, done. Otherwise,
      # compare the two SHAs and verify no mcp-relevant files changed
      # between them — meaning what's live on staging is byte-identical
      # (for the worker's purposes) to what we'd ship. The mcp-relevant
      # path list mirrors deploy-mcp-staging.yml's `paths:` trigger filter,
      # minus the workflow file itself (which isn't a build input).
      #
      # The gate fails closed whenever the comparison cannot prove
      # equivalence:
      #   - the staged commit is not an ancestor of the dispatched commit
      #     (compare status other than "ahead"), so the returned file list
      #     does not describe the change from staging to the dispatched SHA;
      #   - the file list reached GitHub's 300-entry cap and may be
      #     truncated.
      # Renamed files are matched on both their new and previous paths, so
      # a file moved out of mcp/ still counts as an MCP change.
      - name: Require green staging for current MCP build inputs
        uses: actions/github-script@v7
        with:
          script: |
            const workflowId = "deploy-mcp-staging.yml";
            const stagingBranch = "main";
            // GitHub's compare endpoint returns at most this many `files`
            // entries and truncates silently beyond that.
            const compareFileCap = 300;
            const { owner, repo } = context.repo;
            const headSha = context.sha;
            const short = (sha) => sha.slice(0, 7);

            // Fetch without server-side branch/status filters — GitHub's
            // search index can lag minutes behind, causing false negatives
            // on recent runs. Branch and conclusion are filtered locally.
            const runs = await github.paginate(github.rest.actions.listWorkflowRuns, {
              owner,
              repo,
              workflow_id: workflowId,
              per_page: 100,
            });

            // Relies on the API listing runs newest-first, so the first
            // match is the latest one. Requiring head_branch keeps a green
            // run from any other branch from satisfying the gate.
            const latestSuccess = runs.find(
              (run) => run.head_branch === stagingBranch && run.conclusion === "success"
            );
            if (!latestSuccess) {
              core.setFailed(
                `No successful ${workflowId} run on ${stagingBranch}. Cannot promote.`
              );
              return;
            }

            if (latestSuccess.head_sha === headSha) {
              core.info(
                `Latest successful staging run ${latestSuccess.id} is at ${headSha} — safe to promote.`
              );
              return;
            }

            // Different SHAs: check if the diff includes any MCP build input.
            const { data: diff } = await github.rest.repos.compareCommitsWithBasehead({
              owner,
              repo,
              basehead: `${latestSuccess.head_sha}...${headSha}`,
            });

            // Only "ahead" means the staged commit is an ancestor of the
            // dispatched one. For "behind" or "diverged" the three-dot file
            // list is computed from the merge base, so an empty list would
            // not show that staging and the dispatched SHA are equivalent.
            if (diff.status !== "ahead") {
              core.setFailed(
                `Latest successful staging is at ${short(latestSuccess.head_sha)}, which is not ` +
                `an ancestor of the dispatched commit ${short(headSha)} (compare status: ` +
                `${diff.status}). The file diff cannot prove the builds are equivalent. ` +
                `Re-dispatch from current ${stagingBranch}.`
              );
              return;
            }

            const files = diff.files ?? [];

            // A full page may be a truncated one; MCP changes could be
            // hiding past the cap, so refuse rather than guess.
            if (files.length >= compareFileCap) {
              core.setFailed(
                `Diff between ${short(latestSuccess.head_sha)} and ${short(headSha)} lists ` +
                `${files.length} files, which reaches GitHub's ${compareFileCap}-file compare ` +
                `cap; the list may be truncated. Wait for the next ${workflowId} run to land ` +
                `before promoting.`
              );
              return;
            }

            const isMcpRelevant = (filename) =>
              filename.startsWith("mcp/") ||
              filename === "package.json" ||
              filename === "package-lock.json" ||
              filename === ".changeset/config.json";

            // `previous_filename` is only present on renames; including it
            // catches files moved out of an MCP-relevant location.
            const touched = files
              .flatMap((f) => [f.filename, f.previous_filename])
              .filter((path) => path && isMcpRelevant(path));

            if (touched.length > 0) {
              core.setFailed(
                `Latest successful staging is at ${short(latestSuccess.head_sha)} ` +
                `but current main ${short(headSha)} has MCP-relevant changes not yet on ` +
                `staging: ${touched.join(", ")}. Wait for the next deploy-mcp-staging.yml run ` +
                `to land before promoting.`
              );
              return;
            }

            core.info(
              `Latest successful staging run ${latestSuccess.id} is at ` +
              `${short(latestSuccess.head_sha)}; current main ${short(headSha)} ` +
              `differs only in non-MCP paths — safe to promote.`
            );

      # Install the Node.js release named by the workflow-level
      # NODE_VERSION and turn on the action's npm caching.
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          cache: "npm"
          node-version: "${{ env.NODE_VERSION }}"

      # Install dependencies for the whole workspace with `npm ci`,
      # passing --legacy-peer-deps.
      - name: Install workspace dependencies
        run: |
          npm ci --legacy-peer-deps

      # Run the `typecheck` script of the @mcpjam/mcp workspace before
      # anything is deployed.
      - name: Typecheck MCP worker
        run: |
          npm run typecheck -w @mcpjam/mcp

      # Run `wrangler deploy --env production` from the mcp/ directory via
      # cloudflare/wrangler-action, authenticated with the Cloudflare
      # secrets and given the workflow's GITHUB_TOKEN.
      - name: Deploy production worker
        id: deploy
        uses: cloudflare/wrangler-action@v3
        with:
          workingDirectory: mcp
          command: "deploy --env production"
          accountId: "${{ secrets.CLOUDFLARE_ACCOUNT_ID }}"
          apiToken: "${{ secrets.CLOUDFLARE_API_TOKEN }}"
          gitHubToken: "${{ secrets.GITHUB_TOKEN }}"

      # Poll the production URL until the landing page contains the
      # expected marker text, giving up after 20 attempts.
      - name: Smoke test landing page
        run: |
          # A first deploy onto a new custom domain can take up to ~60s
          # while Cloudflare provisions the edge cert and propagates the
          # hostname; deploys onto an existing hostname are near-instant,
          # but retrying costs little either way.
          max_attempts=20

          # One bounded probe: --connect-timeout 3 and --max-time 6 stop a
          # hung TCP/TLS handshake from eating the retry budget and holding
          # the mcp-production concurrency slot. Succeeds only when the
          # response body contains the marker text.
          probe() {
            curl --fail --silent --show-error --connect-timeout 3 --max-time 6 "$PRODUCTION_URL" | grep -q "MCPJam MCP"
          }

          attempt=1
          while [ "$attempt" -le "$max_attempts" ]; do
            if probe; then
              echo "Smoke OK on attempt $attempt"
              exit 0
            fi
            echo "Attempt $attempt returned non-200 or mismatched content; retrying in 3s…"
            sleep 3
            attempt=$((attempt + 1))
          done

          echo "Production URL never resolved correctly." >&2
          exit 1

      # Append a short Markdown report (worker name, URL, MCP endpoint) to
      # the job summary. All lines go through one grouped redirect.
      - name: Summarize production deployment
        run: |
          {
            echo "## ✅ MCP production deployed"
            echo ""
            echo "- Worker: \`mcpjam-mcp-production\`"
            echo "- URL: \`$PRODUCTION_URL\`"
            echo "- MCP endpoint: \`$PRODUCTION_URL/mcp\`"
          } >> "$GITHUB_STEP_SUMMARY"