-
Notifications
You must be signed in to change notification settings - Fork 108
250 lines (223 loc) · 10.1 KB
/
iris-coreweave-ci.yaml
File metadata and controls
250 lines (223 loc) · 10.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
name: Iris - CoreWeave CI

on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "lib/iris/**"
  issue_comment:
    types: [created]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  pull-requests: read  # needed for issue_comment to access PR metadata
  statuses: write  # post commit status from issue_comment trigger

# Shared concurrency group with marin-canary-ferry-cw.yaml — both rebuild/roll
# the shared iris-ci controller and submit against the shared H100 in
# US-WEST-04A. Only one run cluster-wide at a time. cancel-in-progress=false
# so a mid-flight canary is not killed by a PR firing.
concurrency:
  group: iris-coreweave-ci-shared
  cancel-in-progress: false

jobs:
  cw-ci-test:
    # Run for same-repo PRs, manual dispatch, or a '/iris-ci-cw' comment on a
    # PR from a repo member/collaborator/owner (fork PRs can't reach secrets).
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-ci-cw') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v5
        with:
          # For comment-triggered runs, check out the PR head instead of the
          # default branch (issue_comment events fire on the default branch).
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Write kubeconfig
        run: |
          mkdir -p ~/.kube
          echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/coreweave-ci.yaml \
            cluster start --fresh

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: "off"  # quoted — bare `off` is a YAML 1.1 boolean
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          # Wait for rollout to fully settle (old pod terminated, exactly 1 ready).
          kubectl rollout status deployment/iris-controller -n "$IRIS_NAMESPACE" --timeout=120s
          kubectl wait pod -n "$IRIS_NAMESPACE" -l app=iris-controller \
            --for=condition=Ready --timeout=60s
          LOCAL_PORT=$(python3 -c "import socket; s=socket.socket(); s.bind(('',0)); print(s.getsockname()[1]); s.close()")
          kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
          PF_PID=$!
          echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
          echo "LOCAL_PORT=$LOCAL_PORT" >> "$GITHUB_ENV"
          IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
          # Wait for the port-forward tunnel to be usable.
          HEALTHY=false
          for i in $(seq 1 60); do
            if ! kill -0 "$PF_PID" 2>/dev/null; then
              echo "port-forward process died — restarting"
              kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc "${LOCAL_PORT}:10000" &
              PF_PID=$!
              echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"
              sleep 2
              continue
            fi
            if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then
              HEALTHY=true
              break
            fi
            sleep 5
          done
          if [ "$HEALTHY" != "true" ]; then
            echo "Controller did not become healthy within timeout"
            exit 1
          fi
          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x

      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: "off"  # quoted — bare `off` is a YAML 1.1 boolean
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          # LOCAL_PORT persisted via GITHUB_ENV by the previous step.
          export IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
          timeout 600 uv run pytest tests/test_integration_test.py \
            -m integration -o "addopts=" --timeout=600 -v -s

      - name: Stop port-forward
        if: always()
        run: |
          [ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true
          pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true

      - name: Capture failure diagnostics
        if: failure()
        env:
          LOG_DIR: ${{ github.workspace }}/iris-cw-logs
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          mkdir -p "$LOG_DIR"
          # Stream to the GH Actions log for quick triage…
          echo "=== Controller logs (tail) ==="
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
          echo "=== Controller pod describe ==="
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
          echo "=== Worker pods ==="
          kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true
          echo "=== Warning events ==="
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true
          # …and also persist per-pod logs + describe so failures in worker
          # containers are recoverable from the uploaded artifact, not just
          # the controller's view.
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers \
            > "$LOG_DIR/controller.log" 2>&1 || true
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=-1 --all-containers --previous \
            > "$LOG_DIR/controller-previous.log" 2>&1 || true
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller \
            > "$LOG_DIR/controller-describe.txt" 2>&1 || true
          for pod in $(kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" -o name 2>/dev/null); do
            safe=$(echo "$pod" | tr '/' '-')
            kubectl -n "$IRIS_NAMESPACE" logs "$pod" --tail=-1 --all-containers \
              > "$LOG_DIR/${safe}.log" 2>&1 || true
            kubectl -n "$IRIS_NAMESPACE" describe "$pod" \
              > "$LOG_DIR/${safe}-describe.txt" 2>&1 || true
          done
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' \
            > "$LOG_DIR/events.txt" 2>&1 || true

      - name: Upload failure diagnostics
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cw-ci-logs
          path: iris-cw-logs/
          retention-days: 14
          if-no-files-found: ignore

      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "${{ job.status }}" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"