-
Notifications
You must be signed in to change notification settings - Fork 104
219 lines (199 loc) · 8.31 KB
/
marin-datakit-smoke.yaml
File metadata and controls
219 lines (199 loc) · 8.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
name: Marin - Datakit Smoke

on:
  schedule:
    # Daily at 06:30 UTC, offset from canary
    - cron: '30 6 * * *'
  # Also allow manual runs.
  workflow_dispatch:

permissions:
  contents: read
  # claude triage files issues
  issues: write
  # claude-code-action OIDC
  id-token: write

jobs:
datakit-smoke:
  runs-on: ubuntu-latest
  timeout-minutes: 120
  # Every run shares one concurrency group; a newly started run cancels
  # the run that is still in progress.
  concurrency:
    group: datakit-smoke
    cancel-in-progress: true
  env:
    # Both values below embed the same <run_id>-<run_attempt> suffix, so each
    # attempt gets its own run id and its own status file location.
    SMOKE_RUN_ID: datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}
    FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
    WANDB_ENTITY: marin-community
    WANDB_PROJECT: marin
    # Config file handed to every `iris --config=...` invocation in this job.
    IRIS_CONFIG: lib/iris/examples/marin.yaml
    # Service account impersonated when adding/removing the OS Login SSH key.
    IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
  steps:
- name: Checkout code
  uses: actions/checkout@v4

- name: Set up Python 3.12
  uses: actions/setup-python@v5
  with:
    # Quoted so the version stays a string rather than a float.
    python-version: '3.12'

- name: Install uv
  uses: astral-sh/setup-uv@v7
  with:
    enable-cache: true

# Later steps call tools from .venv/bin (iris, python).
- name: Install dependencies
  run: uv sync --all-packages --extra=cpu --no-default-groups
# Authenticate with the CI service-account key stored in repository secrets.
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

# Installs the gcloud CLI used by the OS Login key steps.
- name: Set up Google Cloud SDK
  uses: google-github-actions/setup-gcloud@v2
  with:
    project_id: ${{ secrets.GCP_PROJECT_ID }}
- name: Set up OS Login SSH key
  env:
    # Key comment identifying the workflow run and attempt that created it.
    SSH_KEY_COMMENT: gha-${{ github.run_id }}-${{ github.run_attempt }}
  run: |
    # Create a fresh, passphrase-less 4096-bit RSA key pair for this run.
    mkdir -p ~/.ssh
    ssh-keygen -q -t rsa -b 4096 -N "" \
      -C "$SSH_KEY_COMMENT" \
      -f ~/.ssh/google_compute_engine
    chmod 600 ~/.ssh/google_compute_engine
    # Register the public key with OS Login while impersonating the
    # controller service account; the registration carries a 6h TTL.
    gcloud compute os-login ssh-keys add \
      --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
      --key-file ~/.ssh/google_compute_engine.pub \
      --ttl=6h
# Submits the ferry as an Iris job without waiting for it, and exposes the
# job id printed by `iris job run` as the step output `job_id`.
- name: Submit datakit smoke ferry
  id: submit
  shell: bash -l {0}
  env:
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
  run: |
    # A custom `shell:` gets no implicit fail-fast flags. Without this, a
    # failed submission leaves JOB_ID empty and the step still passes,
    # because the script's exit status is that of the final `echo`.
    set -euo pipefail
    JOB_ID=$(.venv/bin/iris --config="$IRIS_CONFIG" \
      job run --no-wait \
      --memory=2G --disk=4G --cpu=1 --extra=cpu \
      --priority production \
      -e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
      -e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
      -e WANDB_ENTITY "$WANDB_ENTITY" \
      -e WANDB_PROJECT "$WANDB_PROJECT" \
      -e WANDB_API_KEY "$WANDB_API_KEY" \
      -e HF_TOKEN "$HF_TOKEN" \
      -- python -m experiments.ferries.datakit_ferry)
    # Guard against a zero exit status with nothing on stdout, so that
    # later steps never receive an empty job id.
    if [ -z "$JOB_ID" ]; then
      echo "::error::iris job run did not print a job id" >&2
      exit 1
    fi
    echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
    echo "Submitted job: $JOB_ID"
# Polls the submitted job every 30 seconds until it leaves the
# pending/building/running states. Succeeds only on JOB_STATE_SUCCEEDED.
- name: Wait for datakit smoke ferry
  shell: bash -l {0}
  env:
    # Passed through the environment instead of being templated into the
    # script text, so the value is never interpreted as shell syntax.
    JOB_ID: ${{ steps.submit.outputs.job_id }}
  run: |
    # An empty id would make `--prefix ""` match every job and then report
    # a misleading "Job not found"; fail with the real cause instead.
    if [ -z "$JOB_ID" ]; then
      echo "Submit step produced no job id"
      exit 1
    fi
    echo "Polling job status: $JOB_ID"
    while true; do
      # --prefix can match more than one job; select the exact id.
      STATE=$(.venv/bin/iris --config="$IRIS_CONFIG" \
        job list --json --prefix "$JOB_ID" \
        | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
      case "$STATE" in
        JOB_STATE_SUCCEEDED)
          echo "Job succeeded"
          exit 0
          ;;
        JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
          echo "$(date -u +%H:%M:%S) Job state: $STATE"
          sleep 30
          ;;
        "")
          echo "Job not found: $JOB_ID"
          exit 1
          ;;
        *)
          echo "Job finished with state: $STATE"
          # Best-effort dump of every job listed under the prefix. The filter
          # never referenced an id variable, so none is passed to jq.
          .venv/bin/iris --config="$IRIS_CONFIG" \
            job list --json --prefix "$JOB_ID" \
            | jq '.[] | {job_id, state, error}' || true
          exit 1
          ;;
      esac
    done
# Reads `marin_prefix` from the JSON status file at FERRY_STATUS_PATH and
# exposes it as the step output `marin_prefix`.
- name: Read ferry status
  id: ferry_status
  shell: bash -l {0}
  run: |
    # A custom `shell:` runs without -e/pipefail. Without this, a failure in
    # the Python snippet would yield an empty output and a passing step.
    set -euo pipefail
    # The path is read from the environment inside Python rather than being
    # spliced into the Python source by the shell.
    PREFIX=$(.venv/bin/python -c '
    import json
    import os
    from rigging.filesystem import url_to_fs
    status_path = os.environ["FERRY_STATUS_PATH"]
    fs, _ = url_to_fs(status_path)
    with fs.open(status_path) as f:
        print(json.load(f)["marin_prefix"])
    ')
    if [ -z "$PREFIX" ]; then
      echo "::error::marin_prefix read from $FERRY_STATUS_PATH is empty" >&2
      exit 1
    fi
    echo "marin_prefix=$PREFIX" >> "$GITHUB_OUTPUT"
    echo "Ferry output prefix: $PREFIX"
# Runs the validation script with MARIN_PREFIX set to the prefix reported
# by the ferry_status step.
- name: Validate datakit smoke outputs
  env:
    MARIN_PREFIX: ${{ steps.ferry_status.outputs.marin_prefix }}
  shell: bash -l {0}
  run: |
    .venv/bin/python scripts/datakit/validate_ferry_outputs.py
# Best-effort diagnostics: every command is followed by `|| true`, so a
# failing command never changes this step's result.
- name: Capture failure diagnostics
  if: failure() || cancelled()
  run: |
    # Thin wrapper so each call below shares the same binary and config.
    iris() { .venv/bin/iris --config="$IRIS_CONFIG" "$@"; }

    echo "=== Controller logs ==="
    iris process logs --max-lines=200 || true

    echo "=== Job list ==="
    # Only the first five entries of the listing are printed.
    iris job list --json 2>/dev/null | jq '.[0:5]' || true
# One canary-triage skill serves both lanes; the CANARY_LANE variable picks
# datakit-smoke versus tpu. Triage runs only for failed or cancelled
# scheduled runs, never for manual dispatches.
- name: Claude triage
  id: claude_triage
  if: (failure() || cancelled()) && github.event_name == 'schedule'
  timeout-minutes: 30
  uses: anthropics/claude-code-action@v1
  with:
    # Falls back to the second secret when the first one is not set.
    claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN || secrets.CLAUDE_MAX_OAUTH_TOKEN }}
    prompt: |
      Read .agents/skills/canary-triage/SKILL.md and follow it.
    claude_args: |
      --model opus
      --max-turns 500
      --allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
  env:
    CANARY_LANE: datakit-smoke
    # Identifiers of the failing run.
    CANARY_JOB_ID: ${{ steps.submit.outputs.job_id }}
    CANARY_RUN_ID: ${{ env.SMOKE_RUN_ID }}
    IRIS_CONFIG: ${{ env.IRIS_CONFIG }}
    WANDB_ENTITY: ${{ env.WANDB_ENTITY }}
    WANDB_PROJECT: ${{ env.WANDB_PROJECT }}
    WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
    # Link back to this workflow run.
    GHA_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
# Publish the Slack message written by Claude (when one exists) as an
# artifact for the separate notify-slack job. Slack notification lives in
# its own job because a job-level timeout can force-kill this runner before
# any in-job step after Claude triage gets to run — see
# https://github.com/marin-community/marin/actions/runs/24498461666.
- name: Upload Slack message
  if: failure() || cancelled()
  uses: actions/upload-artifact@v4
  with:
    name: slack-message
    path: slack_message.md
    # A missing file is expected when triage wrote no message.
    if-no-files-found: ignore
    retention-days: 1
# Cleanup runs regardless of how earlier steps ended; `|| true` keeps a
# failed removal from changing the job result.
- name: Remove OS Login SSH key
  if: always()
  run: |
    gcloud compute os-login ssh-keys remove \
      --key-file ~/.ssh/google_compute_engine.pub \
      --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" || true
# Separate job so Slack always fires, even if the main job is force-killed
# after its grace window. `needs.datakit-smoke.result` reflects the main
# job outcome; failure()/cancelled() context functions only see this job's
# steps.
notify-slack:
  needs: datakit-smoke
  if: always() && (needs.datakit-smoke.result == 'failure' || needs.datakit-smoke.result == 'cancelled') && github.event_name == 'schedule'
  runs-on: ubuntu-latest
  steps:
    # The artifact only exists when triage wrote a message, so a failed
    # download must not fail the job.
    - name: Download Slack message
      uses: actions/download-artifact@v4
      continue-on-error: true
      with:
        name: slack-message

    - name: Notify Slack
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        # Double-quoted so that \n becomes a real newline in the message.
        FALLBACK_TEXT: ":red_circle: *Datakit Smoke failed*\nRun: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
      run: |
        TEXT=""
        if [ -f slack_message.md ]; then
          TEXT=$(cat slack_message.md)
        fi
        # Use the fallback when the file is missing, empty, or holds only
        # newlines, so a blank message is never posted.
        if [ -z "$TEXT" ]; then
          TEXT="$FALLBACK_TEXT"
        fi
        # json.dumps handles all escaping of the message text.
        PAYLOAD=$(python3 -c "import sys,json; print(json.dumps({'text': sys.stdin.read()}))" <<< "$TEXT")
        # -S keeps curl's error message visible when -f turns an HTTP error
        # status into a failed step.
        curl -sSf -X POST -H 'Content-Type: application/json' -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"