# Source: marin/.github/workflows/marin-canary-ferry-cw.yaml at 7e8b8a8f9e9fe8455fd3cc3c97076c7903919518 (marin-community/marin)
# GPU canary: starts an Iris cluster on CoreWeave, submits the canary ferry
# training job, waits for it, validates its metrics, and tears the cluster down.
name: Marin - CoreWeave GPU Canary Ferry

on:
  # Unattended daily run. Only this trigger enables the Claude triage and
  # Slack notification steps on failure.
  schedule:
    - cron: '0 10 * * *'  # Daily at 10 AM UTC
  # Manual run with optional overrides.
  workflow_dispatch:
    inputs:
      # Replaces the job-level CANARY_TARGET_TOKENS default in the submit step.
      # Setting it also makes the metrics validation step non-blocking.
      target_tokens:
        description: 'Override training token budget'
        type: number
        required: false
      # When true, the teardown step skips the explicit NodePool deletion.
      keep_nodepool:
        description: 'Keep CW node pool alive after the run (for faster re-runs)'
        type: boolean
        default: false

# Scopes granted to this workflow's GITHUB_TOKEN; each entry names the step
# that needs it.
permissions:
  contents: read   # actions/checkout
  packages: write  # docker login ghcr.io for iris cluster start
  issues: write    # claude triage files issues
  id-token: write  # claude-code-action OIDC

jobs:
  canary-ferry-cw:
    runs-on: ubuntu-latest
    # Upper bound for the whole job, including the unbounded polling loop in
    # the "Wait for canary ferry" step.
    timeout-minutes: 180
    # All runs share one group, so a newly triggered run cancels the one in
    # progress instead of running beside it against the same cluster.
    concurrency:
      group: canary-ferry-cw-iris-canary
      cancel-in-progress: true
    env:
      # Unique per workflow run and re-run attempt. Forwarded to the training
      # job and used as the run target when summarizing the profile.
      RUN_ID: canary-gpu-${{ github.run_id }}-${{ github.run_attempt }}
      # Forwarded to the training job by the submit step. CANARY_TARGET_TOKENS
      # is the default that the `target_tokens` dispatch input can override.
      CANARY_ACCELERATOR: gpu
      CANARY_BATCH_SIZE: "16"
      CANARY_TARGET_TOKENS: "6553600"
      # NOTE(review): these three are not forwarded by the submit step;
      # presumably scripts/canary/validate_canary_metrics.py reads them from
      # the step environment as pass/fail thresholds - confirm.
      CANARY_MIN_STEPS: "40"
      CANARY_MAX_LOSS: "8.0"
      CANARY_MAX_WALL_CLOCK: "7200"
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
      # Cluster definition passed to every `iris --config=...` invocation.
      IRIS_CONFIG: lib/iris/examples/coreweave-canary.yaml
      # Must match the label_prefix and namespace in IRIS_CONFIG so teardown
      # targets only this cluster's resources.
      IRIS_LABEL_PREFIX: iris-canary
      IRIS_NAMESPACE: iris-canary

    steps:
      # Fetch the repository so the iris config, scripts, and experiment code
      # referenced by later steps are present on the runner.
      - name: Checkout code
        uses: actions/checkout@v4

      # Quoted so the version is read as the string "3.12", not a float.
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      # Installs all workspace packages with the `cpu` extra and without the
      # default dependency groups. Later steps invoke tools from .venv/bin.
      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      # Materialize the CoreWeave credentials at ~/.kube/coreweave-iris, the
      # path the diagnostics and teardown steps pass to `kubectl --kubeconfig`.
      - name: Write CoreWeave kubeconfig
        env:
          # Delivered through the environment instead of being expanded into
          # the script text: quotes, `$`, or backticks inside the kubeconfig
          # would otherwise be interpreted by the shell and corrupt the file.
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          if [ -z "$CW_KUBECONFIG" ]; then
            echo "::error::CW_KUBECONFIG secret is empty or not available"
            exit 1
          fi
          # Restrict permissions before the file exists so the credentials are
          # never readable by other users, even briefly.
          umask 077
          mkdir -p ~/.kube
          printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      # Docker tooling for the cluster start step (see the `packages: write`
      # permission: "docker login ghcr.io for iris cluster start").
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Authenticates to ghcr.io as the triggering actor using the workflow's
      # own GITHUB_TOKEN; no separate registry secret is involved.
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Bring up the Iris cluster described by IRIS_CONFIG (verbose output).
      - name: Start CoreWeave cluster
        env:
          # Plain-text BuildKit progress keeps build output readable in CI logs.
          BUILDKIT_PROGRESS: plain
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          .venv/bin/iris -v --config="$IRIS_CONFIG" cluster start

      # Submit the canary training job without waiting for it and publish its
      # id as the `job_id` step output for the wait and triage steps.
      - name: Submit canary ferry
        id: submit
        shell: bash -l {0}
        run: |
          # A custom `bash -l {0}` shell runs without errexit. Enable it so a
          # failed submission fails this step instead of exporting an empty
          # job id that only surfaces later as "Job not found".
          set -euo pipefail
          # Each `-e NAME VALUE` pair sets an environment variable on the job.
          # The R2 credentials are exposed under the AWS_* names alongside the
          # R2 endpoint URL.
          JOB_ID=$(.venv/bin/iris --config="$IRIS_CONFIG" \
            job run --no-wait \
            --memory=16G --disk=16G --cpu=1 --extra=cpu \
            -e MARIN_PREFIX s3://marin-na/marin/ \
            -e RUN_ID "$RUN_ID" \
            -e CANARY_ACCELERATOR "$CANARY_ACCELERATOR" \
            -e CANARY_BATCH_SIZE "$CANARY_BATCH_SIZE" \
            -e CANARY_TARGET_TOKENS "$CANARY_TARGET_TOKENS" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -e AWS_ACCESS_KEY_ID "$R2_ACCESS_KEY_ID" \
            -e AWS_SECRET_ACCESS_KEY "$R2_SECRET_ACCESS_KEY" \
            -e AWS_ENDPOINT_URL "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com" \
            -- python -m experiments.ferries.canary_ferry)
          if [ -z "$JOB_ID" ]; then
            echo "::error::iris job run did not return a job id"
            exit 1
          fi
          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
          echo "Submitted job: $JOB_ID"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          # The manual `target_tokens` input wins; otherwise the job-level
          # default applies (always the case for scheduled runs).
          CANARY_TARGET_TOKENS: ${{ inputs.target_tokens || env.CANARY_TARGET_TOKENS }}

      # Poll the submitted job every 30 seconds until it reaches a terminal
      # state. Exits 0 only for JOB_STATE_SUCCEEDED; the loop itself has no
      # deadline and relies on the job-level timeout-minutes.
      - name: Wait for canary ferry
        shell: bash -l {0}
        env:
          # Passed through the environment rather than expanded into the
          # script text, so the value is never parsed as shell syntax.
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          if [ -z "$JOB_ID" ]; then
            echo "Submit step produced no job id"
            exit 1
          fi
          # An empty state means either the job is missing or the list call
          # itself failed (its output is then empty). Tolerate a few
          # consecutive empty polls so one transient API error does not abort
          # a long-running canary.
          MAX_EMPTY_POLLS=5
          empty_polls=0
          echo "Polling job status: $JOB_ID"
          while true; do
            # --prefix may match several jobs; keep only the exact id.
            STATE=$(.venv/bin/iris --config="$IRIS_CONFIG" \
              job list --json --prefix "$JOB_ID" \
              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
            case "$STATE" in
              JOB_STATE_SUCCEEDED)
                echo "Job succeeded"
                exit 0
                ;;
              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
                empty_polls=0
                echo "$(date -u +%H:%M:%S) Job state: $STATE"
                sleep 30
                ;;
              "")
                empty_polls=$((empty_polls + 1))
                if [ "$empty_polls" -ge "$MAX_EMPTY_POLLS" ]; then
                  echo "Job not found: $JOB_ID"
                  exit 1
                fi
                echo "$(date -u +%H:%M:%S) No state for $JOB_ID (poll $empty_polls/$MAX_EMPTY_POLLS); retrying"
                sleep 30
                ;;
              *)
                # Any other state is terminal and unsuccessful. Print this
                # job's record (best-effort) before failing the step.
                echo "Job finished with state: $STATE"
                .venv/bin/iris --config="$IRIS_CONFIG" \
                  job list --json --prefix "$JOB_ID" \
                  | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true
                exit 1
                ;;
            esac
          done

      # Run the metrics validation script on the runner, with the R2
      # credentials exposed under AWS_* names and the same MARIN_PREFIX the
      # training job was given.
      - name: Validate canary metrics
        # Non-blocking whenever a manual run overrides the token budget.
        # NOTE(review): presumably because the thresholds are tuned for the
        # default budget - confirm.
        continue-on-error: ${{ inputs.target_tokens != '' }}
        shell: bash -l {0}
        run: .venv/bin/python scripts/canary/validate_canary_metrics.py
        env:
          MARIN_PREFIX: s3://marin-na/marin/
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com

      # Informational only: a failure here never fails the job.
      - name: Summarize GPU canary profile
        continue-on-error: true
        shell: bash -l {0}
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: |
          # Identify the run by this workflow's RUN_ID within the configured
          # W&B entity and project.
          summarize_args=(
            --run-target "$RUN_ID"
            --entity "$WANDB_ENTITY"
            --project "$WANDB_PROJECT"
          )
          .venv/bin/python -m marin.profiling.cli summarize "${summarize_args[@]}"

      # On failure, dump controller and task pod state from the cluster
      # namespace into the job log before teardown removes it.
      - name: Capture failure diagnostics
        if: failure()
        run: |
          # Best-effort kubectl wrapper scoped to the canary namespace: a
          # failing query must not stop the remaining diagnostics.
          kc() {
            kubectl --kubeconfig ~/.kube/coreweave-iris -n "$IRIS_NAMESPACE" "$@" || true
          }

          echo "=== Controller logs ==="
          kc logs -l app=iris-controller --tail=500
          echo "=== Controller pod describe ==="
          kc describe pod -l app=iris-controller
          echo "=== Task pod logs ==="
          kc logs -l iris.managed=true --tail=200
          echo "=== Task pod describe ==="
          kc describe pod -l iris.managed=true
          echo "=== Warning events ==="
          kc get events --sort-by='.lastTimestamp' --field-selector 'type!=Normal'

      # Automated triage, run only when a scheduled run fails. It is placed
      # before teardown, so the cluster has not been stopped yet when it runs.
      - name: Claude triage
        id: claude_triage
        if: failure() && github.event_name == 'schedule'
        uses: anthropics/claude-code-action@v1
        # Separate cap so triage cannot consume the rest of the job's budget.
        timeout-minutes: 30
        with:
          # Falls back to the second secret when the first is unset or empty.
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN || secrets.CLAUDE_MAX_OAUTH_TOKEN }}
          # The actual procedure lives in the referenced skill file.
          prompt: |
            Read .agents/skills/canary-triage/SKILL.md and follow it.
          # Tool access is limited to the listed Bash command prefixes.
          claude_args: |
            --model opus
            --max-turns 50
            --allowedTools "Bash(kubectl:*),Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
        # Context for the triage run: which job and run failed, and where to
        # find the cluster, the W&B project, and this workflow run.
        env:
          CANARY_LANE: gpu
          CANARY_JOB_ID: ${{ steps.submit.outputs.job_id }}
          CANARY_RUN_ID: ${{ env.RUN_ID }}
          IRIS_CONFIG: ${{ env.IRIS_CONFIG }}
          IRIS_NAMESPACE: ${{ env.IRIS_NAMESPACE }}
          WANDB_ENTITY: ${{ env.WANDB_ENTITY }}
          WANDB_PROJECT: ${{ env.WANDB_PROJECT }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          GHA_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      # NodePools outlive `cluster stop`, which removes only Pods and leaves
      # scale-down to the CW autoscaler. Removing the pools here keeps idle
      # H100 capacity from accruing cost after the run.
      - name: Tear down CoreWeave cluster
        if: always()
        run: |
          keep_nodepool="${{ inputs.keep_nodepool }}"

          # Stopping is best-effort so a failed stop still reaches the
          # NodePool cleanup below.
          .venv/bin/iris -v --config="$IRIS_CONFIG" cluster stop || true

          if [ "$keep_nodepool" = "true" ]; then
            echo "Keeping node pool alive (keep_nodepool=true)"
          else
            # Select only the pools labelled as managed by this cluster.
            kubectl --kubeconfig ~/.kube/coreweave-iris \
              delete nodepool -l "iris-${IRIS_LABEL_PREFIX}-managed=true"
          fi

      # Post a failure notice to Slack for scheduled runs only.
      - name: Notify Slack on failure
        if: failure() && github.event_name == 'schedule'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          FALLBACK_TEXT: ":red_circle: *GPU Canary failed*\nRun: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        run: |
          # Use slack_message.md from the workspace when it exists; otherwise
          # send the generic text with a link to this run.
          # NOTE(review): the file is presumably written by the triage step - confirm.
          message="$FALLBACK_TEXT"
          if [ -f slack_message.md ]; then
            message=$(cat slack_message.md)
          fi
          # json.dumps handles all escaping of the message body.
          payload=$(python3 -c "import sys,json; print(json.dumps({'text': sys.stdin.read()}))" <<< "$message")
          curl -sf -X POST -H 'Content-Type: application/json' -d "$payload" "$SLACK_WEBHOOK_URL"