-
Notifications
You must be signed in to change notification settings - Fork 136
192 lines (173 loc) · 7.33 KB
/
Copy pathiris-smoke-coreweave.yaml
File metadata and controls
192 lines (173 loc) · 7.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# Display name of this workflow.
name: 'Iris - Smoke - CoreWeave'

# Triggers: pull requests (opened or updated) that touch the Iris library,
# this workflow file, or the monitor script; plus manual dispatch.
on:
  pull_request:
    types:
      - opened
      - synchronize
    paths:
      - 'lib/iris/**'
      - '.github/workflows/iris-smoke-coreweave.yaml'
      - 'scripts/workflows/iris_monitor.py'
  workflow_dispatch:
# Token scopes granted to every job in this workflow.
permissions:
  contents: read
  packages: write
  # Needed to post the commit status when triggered via workflow_dispatch.
  statuses: write
# This concurrency group is shared with marin-canary-ferry-coreweave.yaml:
# both workflows rebuild/roll the shared iris-ci controller and submit
# against the shared H100 in US-WEST-04A, so only one run may be active
# cluster-wide at a time. In-progress runs are never cancelled
# (cancel-in-progress=false), so a PR firing does not kill a mid-flight canary.
concurrency:
  group: 'iris-coreweave-ci-shared'
  cancel-in-progress: false
jobs:
  # Single job: resets the shared iris-ci cluster state, (re)starts the
  # controller, runs the integration suites against it, and collects
  # diagnostics on failure.
  cw-ci-test:
    # Run for pull requests whose head branch lives in this repository
    # (PRs from other repositories are skipped) and for manual dispatches.
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      # Only manual dispatches post a commit status from this workflow.
      # Best-effort: a failure to post the pending status does not fail the job.
      - name: Set commit status to pending
        if: github.event_name == 'workflow_dispatch'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Write kubeconfig
        env:
          # Pass the secret through the environment instead of interpolating
          # it into the script text, so quotes, `$`, or backticks inside the
          # kubeconfig cannot break the script or be expanded by the shell.
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          mkdir -p ~/.kube
          # Restrictive umask so the file is never readable by other users,
          # even before the explicit chmod below.
          umask 077
          printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      # Best-effort: a failed delete does not fail the job.
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=config/coreweave-ci.yaml \
            cluster start --fresh

      # NOTE(review): later steps read IRIS_CONTROLLER_URL, PF_LOG and PF_PID,
      # none of which are defined in this workflow — presumably this script
      # exports them for subsequent steps; confirm against iris_monitor.py.
      - name: Connect to iris-ci controller
        run: |
          uv run python scripts/workflows/iris_monitor.py coreweave-controller \
            --namespace "$IRIS_NAMESPACE" \
            --kubeconfig "$HOME/.kube/coreweave-iris" \
            --log-path "$RUNNER_TEMP/iris-cw-port-forward.log"

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted: a bare `off` is a boolean under YAML 1.1 parsers.
          JAX_TRACEBACK_FILTERING: "off"
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x

      # Same environment as the step above; the whole pytest invocation is
      # additionally capped at 1800 seconds by `timeout`.
      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted: a bare `off` is a boolean under YAML 1.1 parsers.
          JAX_TRACEBACK_FILTERING: "off"
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          timeout 1800 uv run pytest tests/test_integration_test.py \
            -m integration -o "addopts=" --timeout=900 -v -s

      # Runs only after an earlier step failed; errors in this step itself
      # are tolerated so the upload and cleanup steps still run.
      - name: Capture failure diagnostics
        if: failure()
        continue-on-error: true
        env:
          LOG_DIR: ${{ github.workspace }}/iris-cw-logs
        run: |
          mkdir -p "$LOG_DIR"
          if [ -n "${PF_LOG:-}" ] && [ -f "$PF_LOG" ]; then
            cp "$PF_LOG" "$LOG_DIR/port-forward.log"
          fi
          # No job submitted in this lane, so the iris.job_id selector matches
          # nothing — continue-on-error tolerates the empty kubernetes-pods.json.
          uv run python scripts/workflows/iris_monitor.py collect \
            --job-id "ci-smoke" \
            --controller-url "$IRIS_CONTROLLER_URL" \
            --provider coreweave \
            --output-dir "$LOG_DIR" \
            --namespace "$IRIS_NAMESPACE" \
            --managed-label "$IRIS_MANAGED_LABEL" \
            --kubeconfig "$HOME/.kube/coreweave-iris"

      - name: Upload failure diagnostics
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cw-ci-logs
          path: iris-cw-logs/
          retention-days: 14
          if-no-files-found: ignore

      # Always clean up the port-forward: first by PID when one is known,
      # then by matching the kubectl command line in either argument order.
      - name: Stop port-forward
        if: always()
        run: |
          if [ -n "${PF_PID:-}" ]; then
            kill "$PF_PID" 2>/dev/null || true
          fi
          pkill -f "kubectl.*$IRIS_NAMESPACE.*port-forward.*pod/iris-controller" 2>/dev/null || true
          pkill -f "kubectl.*port-forward.*$IRIS_NAMESPACE.*pod/iris-controller" 2>/dev/null || true

      # Mirror the final job status onto the commit for manual dispatches.
      # Any status other than "success" is reported as failure.
      - name: Set commit status to result
        if: always() && github.event_name == 'workflow_dispatch'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "${{ job.status }}" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"