fullsend/.github/workflows/functional-tests.yml at dcb302c65d6f0d690c748e807b4a1e90924c9f66 · fullsend-ai/fullsend · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
name: Functional Tests

# PR-triggered functional tests use pull_request_target so fork PRs receive
# secrets.  Authorization runs in a separate gate job (base checkout only)
# before the test job checks out the PR head — same pattern as e2e.yml.

# Workflow-level default: the token gets no permissions. Each job below
# declares the scopes it needs in its own `permissions:` block.
permissions: {}

# Triggers. Pushes to main are path-filtered here; pull_request_target and
# merge_group runs are filtered by a step inside the functional-tests job.
on:
  push:
    branches:
      - main
    # SYNC-WITH: grep regex in "Check for functional-test-relevant changes" step
    paths:
      - "eval/**"
      - "internal/scaffold/**"
      - ".github/workflows/functional-tests.yml"
      - ".github/scripts/**"
  pull_request_target:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  merge_group:
  workflow_dispatch:

# One concurrency group per PR (pull_request_target) or per workflow+ref
# (all other events). A newer run cancels an older one in the same group,
# except for non-PR runs on main.
#
# A 'labeled' event for any label other than ok-to-test starts a run whose
# jobs are all skipped (see the gate job's `if:`). Concurrency is applied to
# the run before job conditions are evaluated, so such a run would still
# cancel a test run already in progress for the same PR. Those no-op runs are
# therefore placed in their own group via the '-unrelated-label' suffix.
concurrency:
  group: >-
    ${{ github.event_name == 'pull_request_target'
        && format('functional-{0}{1}',
                  github.event.pull_request.number,
                  ((github.event.action == 'labeled'
                    && github.event.label.name != 'ok-to-test')
                   && '-unrelated-label' || ''))
        || format('{0}-{1}', github.workflow, github.ref) }}
  cancel-in-progress: >-
    ${{ github.event_name == 'pull_request_target'
        || github.ref != 'refs/heads/main' }}

jobs:
  gate:
    # Separate job so pull-requests: write stays out of the job that checks
    # out fork head and runs tests with secrets.
    # Never checkout github.event.pull_request.head.sha here.
    #
    # Runs only for pull_request_target. A 'labeled' event is considered only
    # when the label that was added is ok-to-test; any other label skips the
    # gate entirely.
    if: >-
      github.event_name == 'pull_request_target' &&
      (github.event.action != 'labeled' || github.event.label.name == 'ok-to-test')
    runs-on: ubuntu-24.04
    timeout-minutes: 5
    permissions:
      contents: read
      pull-requests: write
    outputs:
      # Consumed by the functional-tests job's `if:`, which compares it to 'true'.
      authorized: ${{ steps.auth.outputs.authorized }}
    steps:
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        with:
          ref: ${{ github.sha }}  # Base branch only — never checkout PR head in gate

      # Local action (path-based `uses:`), so it is loaded from the checkout
      # performed in the previous step. Its `authorized` output becomes the
      # job output above.
      - name: Check PR authorization
        id: auth
        uses: ./.github/actions/check-e2e-authorization
        with:
          pr_number: ${{ github.event.pull_request.number }}
          repository: ${{ github.repository }}
          pr_updated_at: ${{ github.event.pull_request.updated_at }}
          event_action: ${{ github.event.action }}
          pr_author_association: ${{ github.event.pull_request.author_association }}
          pr_author_login: ${{ github.event.pull_request.user.login }}

  functional-tests:
    # For pull_request_target, runs only when gate sets authorized=true.
    # Do not treat a skipped gate as authorized.
    # This job checks out untrusted PR head code — no pull-requests: write here.
    needs: gate
    # gate is skipped for every event other than pull_request_target. The
    # explicit status function (!cancelled()) lets this condition be evaluated
    # in that case instead of the job being skipped along with its dependency.
    # For pull_request_target, a skipped or failed gate leaves `authorized`
    # empty, which does not equal 'true', so the job does not run.
    if: >-
      !cancelled() &&
      (github.event_name != 'pull_request_target' || needs.gate.outputs.authorized == 'true')
    runs-on: ubuntu-24.04
    timeout-minutes: 45
    permissions:
      contents: read
      # OIDC token for the "Authenticate to GCP" step (workload_identity_provider).
      id-token: write
    steps:
      - name: Check for functional-test-relevant changes
        id: changes
        if: github.event_name == 'pull_request_target' || github.event_name == 'merge_group'
        env:
          GH_TOKEN: ${{ github.token }}
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          REPO: ${{ github.repository }}
          MERGE_GROUP_BASE: ${{ github.event.merge_group.base_sha }}
          MERGE_GROUP_HEAD: ${{ github.event.merge_group.head_sha }}
        # SYNC-WITH: push.paths filter above
        run: |
          # Fail open: warn, report the change set as relevant, and end the step.
          run_as_precaution() {
            echo "::warning::$1 — running functional tests as a precaution"
            echo "relevant=true" >> "$GITHUB_OUTPUT"
            exit 0
          }

          # Collect the changed file names, one per line, for the current event.
          case "$EVENT_NAME" in
            merge_group)
              changed=$(gh api "repos/${REPO}/compare/${MERGE_GROUP_BASE}...${MERGE_GROUP_HEAD}" --jq '.files[].filename') \
                || run_as_precaution "Failed to fetch merge group files"
              # 300 or more entries is treated as a possibly truncated listing.
              changed_count=$(echo "$changed" | wc -l)
              if [ "$changed_count" -ge 300 ]; then
                run_as_precaution "Compare API returned $changed_count files (possible truncation at 300)"
              fi
              ;;
            *)
              changed=$(gh api "repos/${REPO}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename') \
                || run_as_precaution "Failed to fetch PR files"
              ;;
          esac

          # Same path set as the push trigger's `paths:` filter.
          if echo "$changed" | grep -qE '^eval/|^internal/scaffold/|^\.github/workflows/functional-tests\.yml$|^\.github/scripts/'; then
            echo "relevant=true" >> "$GITHUB_OUTPUT"
          else
            echo "::notice::No functional-test-relevant files changed — skipping tests"
            echo "relevant=false" >> "$GITHUB_OUTPUT"
          fi

      # Check out the code under test: the PR head SHA for pull_request_target,
      # otherwise github.sha. Skipped only when the changes step explicitly
      # reported relevant=false; an unset output (step did not run) still
      # checks out.
      - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
        if: steps.changes.outputs.relevant != 'false'
        with:
          ref: ${{ github.event_name == 'pull_request_target' && github.event.pull_request.head.sha || github.sha }}
          # Later steps build and run the checked-out (possibly fork) code, so
          # the token is not persisted for them.
          persist-credentials: false
          # checkout@v7 blocks fork PR head checkouts on pull_request_target by default.
          # Safe here: gate job authorizes before this job runs; no pull-requests: write.
          allow-unsafe-pr-checkout: ${{ github.event_name == 'pull_request_target' }}
          submodules: true

      # Toolchain and build steps. Each is skipped only when the changes step
      # explicitly reported relevant=false.

      # Go version is taken from the checked-out go.mod.
      - uses: actions/setup-go@924ae3a1cded613372ab5595356fb5720e22ba16 # v6.5.0
        if: steps.changes.outputs.relevant != 'false'
        with:
          go-version-file: go.mod

      - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        if: steps.changes.outputs.relevant != 'false'
        with:
          python-version: "3.12"

      - name: Install uv
        if: steps.changes.outputs.relevant != 'false'
        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0

      # Editable install of the harness from the repository tree, with the
      # `anthropic` extra, into the system interpreter.
      - name: Install agent-eval-harness
        if: steps.changes.outputs.relevant != 'false'
        run: uv pip install --system -e 'eval/.agent-eval-harness[anthropic]'

      # Downloads a pinned yq release binary (v4.47.1, linux/amd64).
      # NOTE(review): the download is not checksum-verified.
      - name: Install yq
        if: steps.changes.outputs.relevant != 'false'
        run: |
          curl -sSfL "https://github.com/mikefarah/yq/releases/download/v4.47.1/yq_linux_amd64" -o /usr/local/bin/yq
          chmod +x /usr/local/bin/yq

      # Global git author identity for later steps.
      - name: Configure git identity
        if: steps.changes.outputs.relevant != 'false'
        run: |
          git config --global user.name "fullsend-eval[bot]"
          git config --global user.email "fullsend-eval[bot]@users.noreply.github.com"

      - name: Build fullsend
        if: steps.changes.outputs.relevant != 'false'
        run: make go-build

      # Make <workspace>/bin available on PATH for all subsequent steps. The
      # workspace path is read from the runner-provided environment variable
      # rather than interpolated into the script text.
      - name: Add bin to PATH
        if: steps.changes.outputs.relevant != 'false'
        run: echo "${GITHUB_WORKSPACE}/bin" >> "$GITHUB_PATH"

      # Writes gateway.env and gateway.toml under $HOME/.config/openshell.
      # The heredoc delimiter is unquoted, so ${OPENSHELL_VERSION} is expanded
      # by the shell when the file is written; it is presumably set by the
      # sourced openshell-version.sh — confirm against that script.
      - name: Configure OpenShell gateway
        if: steps.changes.outputs.relevant != 'false'
        run: |
          source .github/scripts/openshell-version.sh
          mkdir -p "$HOME/.config/openshell"
          echo "OPENSHELL_BIND_ADDRESS=0.0.0.0" > "$HOME/.config/openshell/gateway.env"
          cat > "$HOME/.config/openshell/gateway.toml" << EOF
          [openshell]
          version = 1

          [openshell.gateway]
          supervisor_image = "ghcr.io/nvidia/openshell/supervisor:${OPENSHELL_VERSION}"
          EOF

      # Runs the repository's installer script from the checked-out tree.
      - name: Install OpenShell CLI
        if: steps.changes.outputs.relevant != 'false'
        run: .github/scripts/install-openshell.sh

      - name: Install Podman
        if: steps.changes.outputs.relevant != 'false'
        run: |
          sudo apt-get update
          sudo apt-get install -y podman

      # Adds subordinate UID and GID ranges for the current user when
      # /etc/subuid has no entry for it, then runs `podman system migrate`.
      # NOTE(review): only /etc/subuid is consulted; /etc/subgid is assumed to
      # be in the same state.
      - name: Configure rootless Podman
        if: steps.changes.outputs.relevant != 'false'
        run: |
          whoami_user="$(whoami)"
          grep -q "^${whoami_user}:" /etc/subuid || sudo usermod --add-subuids 100000-165535 --add-subgids 100000-165535 "${whoami_user}"
          podman system migrate

      # Starts a Podman API service on the per-user socket unless a socket
      # already exists at that path, then waits up to 30 attempts (1s apart)
      # for it to answer. Fails the step if it never becomes ready.
      - name: Start Podman API service
        if: steps.changes.outputs.relevant != 'false'
        run: |
          SOCKET_PATH="${XDG_RUNTIME_DIR:-/run/user/$(id -u)}/podman/podman.sock"
          if [ ! -S "${SOCKET_PATH}" ]; then
            mkdir -p "$(dirname "${SOCKET_PATH}")"
            # --time=0 disables the idle timeout; the service runs in the background.
            podman system service --time=0 "unix://${SOCKET_PATH}" &
            # Ready means the socket exists AND the API answers `podman info`.
            # Checking only for the socket file afterwards would accept a
            # service that bound the socket but never responded.
            ready=false
            for _i in $(seq 1 30); do
              if [ -S "${SOCKET_PATH}" ] && podman --url "unix://${SOCKET_PATH}" info >/dev/null 2>&1; then
                ready=true
                break
              fi
              sleep 1
            done
            if [ "${ready}" != "true" ]; then
              echo "::error::Podman socket not ready"
              exit 1
            fi
          fi

      - name: Install validation dependencies
        if: steps.changes.outputs.relevant != 'false'
        run: pip install --quiet 'jsonschema>=4.18.0'

      # Publishes `available` = true/false depending on whether the GCP
      # workload-identity provider secret is non-empty. The GCP-dependent steps
      # below run only when it is 'true'.
      - name: Check for secrets
        id: secrets-check
        if: steps.changes.outputs.relevant != 'false'
        env:
          WIF_PROVIDER: ${{ secrets.E2E_GCP_WIF_PROVIDER }}
        run: |
          secrets_available=true
          if [ -z "$WIF_PROVIDER" ]; then
            echo "::warning::GCP secrets are not configured. Skipping functional tests."
            secrets_available=false
          fi
          echo "available=${secrets_available}" >> "$GITHUB_OUTPUT"

      # The remaining GCP-dependent steps run only when the change set is
      # relevant and the secrets check reported available=true.
      - name: Authenticate to GCP
        if: steps.changes.outputs.relevant != 'false' && steps.secrets-check.outputs.available == 'true'
        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3.0.0
        with:
          workload_identity_provider: ${{ secrets.E2E_GCP_WIF_PROVIDER }}
          service_account: ${{ secrets.E2E_GCP_SERVICE_ACCOUNT }}

      # Re-exports the current GOOGLE_APPLICATION_CREDENTIALS value (presumably
      # set by the auth step above — confirm) as HOST_GOOGLE_APPLICATION_CREDENTIALS
      # for later steps, then runs the repository's credential-preparation script.
      - name: Prepare sandbox credentials
        if: steps.changes.outputs.relevant != 'false' && steps.secrets-check.outputs.available == 'true'
        run: |
          echo "HOST_GOOGLE_APPLICATION_CREDENTIALS=$GOOGLE_APPLICATION_CREDENTIALS" >> "$GITHUB_ENV"
          bash internal/scaffold/fullsend-repo/scripts/prepare-sandbox-credentials.sh

      # Runs the `functional-tests` make target. EVALS_HOST_CREDENTIALS carries
      # the value exported by the previous step.
      - name: Run functional tests
        if: steps.changes.outputs.relevant != 'false' && steps.secrets-check.outputs.available == 'true'
        env:
          EVAL_ORG: ${{ vars.EVAL_ORG }}
          GH_TOKEN: ${{ secrets.EVAL_GH_TOKEN }}
          ANTHROPIC_VERTEX_PROJECT_ID: ${{ vars.EVALS_VERTEX_PROJECT_ID }}
          GOOGLE_CLOUD_PROJECT: ${{ secrets.E2E_GCP_PROJECT_ID }}
          CLOUD_ML_REGION: ${{ vars.EVALS_GCP_REGION }}
          EVALS_HOST_CREDENTIALS: ${{ env.HOST_GOOGLE_APPLICATION_CREDENTIALS }}
        run: make functional-tests

      # Runs even when earlier steps failed (always()) so that .eval-env files
      # are removed before results are uploaded.
      - name: Scrub secrets from eval results
        if: always() && steps.changes.outputs.relevant != 'false' && steps.secrets-check.outputs.available == 'true'
        run: |
          # Best effort: a missing directory or failed delete must not fail the step.
          for results_dir in eval/runs/ /tmp/agent-eval/; do
            find "$results_dir" -name '.eval-env' -delete 2>/dev/null || true
          done

      # Uploads eval/runs/ as the `eval-results` artifact even when the test
      # step failed (always()). The negated pattern excludes .eval-env files
      # from the artifact regardless of whether the scrub step removed them.
      - name: Upload eval results
        if: always() && steps.changes.outputs.relevant != 'false' && steps.secrets-check.outputs.available == 'true'
        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
        with:
          name: eval-results
          path: |
            eval/runs/
            !eval/runs/**/.eval-env
          retention-days: 30