-
Notifications
You must be signed in to change notification settings - Fork 108
250 lines (223 loc) · 10.1 KB
/
iris-coreweave-ci.yaml
File metadata and controls
250 lines (223 loc) · 10.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
name: Iris - CoreWeave CI

on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "lib/iris/**"
  issue_comment:
    types: [created]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  pull-requests: read  # needed for issue_comment to access PR metadata
  statuses: write  # post commit status from issue_comment trigger

# Shared concurrency group with marin-canary-ferry-cw.yaml — both rebuild/roll
# the shared iris-ci controller and submit against the shared H100 in
# US-WEST-04A. Only one run cluster-wide at a time. cancel-in-progress=false
# so a mid-flight canary is not killed by a PR firing.
concurrency:
  group: iris-coreweave-ci-shared
  cancel-in-progress: false

jobs:
  cw-ci-test:
    # Run for same-repo PRs, manual dispatch, or a '/iris-ci-cw' comment on a
    # PR from a repo member/collaborator/owner (fork PRs can't reach secrets).
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-ci-cw') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
        with:
          # For comment-triggered runs, check out the PR head instead of the
          # default branch (issue_comment events fire on the default branch).
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Write kubeconfig
        run: |
          mkdir -p ~/.kube
          echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/coreweave-ci.yaml \
            cluster start --fresh

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: "off"  # quoted — bare `off` is a YAML 1.1 boolean
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          # Wait for rollout to fully settle (old pod terminated, exactly 1 ready).
          kubectl rollout status deployment/iris-controller -n "$IRIS_NAMESPACE" --timeout=120s
          kubectl wait pod -n "$IRIS_NAMESPACE" -l app=iris-controller \
            --for=condition=Ready --timeout=60s
          LOCAL_PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('',0)); print(s.getsockname()[1]); s.close()")
          kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
          PF_PID=$!
          echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
          echo "LOCAL_PORT=$LOCAL_PORT" >> "$GITHUB_ENV"
          IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
          # Wait for the port-forward tunnel to be usable.
          HEALTHY=false
          for i in $(seq 1 60); do
            if ! kill -0 "$PF_PID" 2>/dev/null; then
              echo "port-forward process died — restarting"
              kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
              PF_PID=$!
              echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
              sleep 2
              continue
            fi
            if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then
              HEALTHY=true
              break
            fi
            sleep 5
          done
          if [ "$HEALTHY" != "true" ]; then
            echo "Controller did not become healthy within timeout"
            exit 1
          fi
          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x

      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: "off"  # quoted — bare `off` is a YAML 1.1 boolean
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          # LOCAL_PORT persisted via GITHUB_ENV by the previous step.
          export IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
          timeout 600 uv run pytest tests/test_integration_test.py \
            -m integration -o "addopts=" --timeout=600 -v -s

      - name: Stop port-forward
        if: always()
        run: |
          [ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true
          pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true

      - name: Capture failure diagnostics
        if: failure()
        env:
          LOG_DIR: ${{ github.workspace }}/iris-cw-logs
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          mkdir -p "$LOG_DIR"
          # Stream to the GH Actions log for quick triage…
          echo "=== Controller logs (tail) ==="
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
          echo "=== Controller pod describe ==="
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
          echo "=== Worker pods ==="
          kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true
          echo "=== Warning events ==="
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true
          # …and also persist per-pod logs + describe so failures in worker
          # containers are recoverable from the uploaded artifact, not just
          # the controller's view.
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers \
            > "$LOG_DIR/controller.log" 2>&1 || true
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers --previous \
            > "$LOG_DIR/controller-previous.log" 2>&1 || true
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller \
            > "$LOG_DIR/controller-describe.txt" 2>&1 || true
          for pod in $(kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" -o name 2>/dev/null); do
            safe=$(echo "$pod" | tr '/' '-')
            kubectl -n "$IRIS_NAMESPACE" logs "$pod" --tail=-1 --all-containers \
              > "$LOG_DIR/${safe}.log" 2>&1 || true
            kubectl -n "$IRIS_NAMESPACE" describe "$pod" \
              > "$LOG_DIR/${safe}-describe.txt" 2>&1 || true
          done
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' \
            > "$LOG_DIR/events.txt" 2>&1 || true

      - name: Upload failure diagnostics
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cw-ci-logs
          path: iris-cw-logs/
          retention-days: 14
          if-no-files-found: ignore

      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "${{ job.status }}" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"