Skip to content

Commit c5572e1

Browse files
authored
Add CoreWeave CI workflow for Iris PRs (#4174)
- New workflow `.github/workflows/iris-coreweave-ci.yaml` runs the Iris integration test suite (`tests/integration/iris/`) against a persistent CoreWeave cluster on every PR touching `lib/iris/**`
- New cluster config `lib/iris/examples/coreweave-ci.yaml` with CPU and GPU×8 scale groups pinned at min=max=1 slice for warm starts
- Unlike the existing CW smoke/canary workflows, nodepools are **not** torn down between runs — only controller and worker pods are reset via `cluster controller restart`
- Concurrency limit of 1 (shared warm cluster); triggered by PR paths, `/iris-ci-cw` comment, or manual dispatch
1 parent 0aaa3b2 commit c5572e1

File tree

11 files changed

+645
-155
lines changed

11 files changed

+645
-155
lines changed
Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
---
name: Iris - CoreWeave CI

on:
  pull_request:
    types: [opened, synchronize]
    paths:
      - "lib/iris/**"
  issue_comment:
    types: [created]
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  pull-requests: read  # needed for issue_comment to access PR metadata
  statuses: write  # post commit status from issue_comment trigger

# Single concurrency group — only one CW CI run at a time across all PRs.
# The warm cluster is shared; concurrent runs would conflict.
concurrency:
  group: iris-coreweave-ci
  cancel-in-progress: false

jobs:
  cw-ci-test:
    # Same-repo PRs, manual dispatch, or a trusted "/iris-ci-cw" PR comment.
    # Fork PRs are excluded because this job needs repository secrets.
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-ci-cw') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Comment triggers run against the default branch by default;
          # explicitly check out the PR head in that case.
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Write kubeconfig
        # Pass the secret through env instead of interpolating it into the
        # script body (avoids shell-interpretation of secret content), and use
        # printf so content starting with "-" is not mangled by echo.
        env:
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          mkdir -p ~/.kube
          printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/coreweave-ci.yaml \
            cluster start

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted: a bare `off` is a YAML 1.1 boolean and may reach the
          # process as the string "false" instead of "off".
          JAX_TRACEBACK_FILTERING: "off"
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl port-forward -n "$IRIS_NAMESPACE" svc/iris-ci-controller-svc 10000:10000 &
          PF_PID=$!
          echo "PF_PID=$PF_PID" >> "$GITHUB_ENV"

          IRIS_CONTROLLER_URL="http://localhost:10000"

          # Controller deployment is already confirmed ready by `cluster start`;
          # this just waits for the port-forward to be usable.
          HEALTHY=false
          for i in $(seq 1 60); do
            if ! kill -0 "$PF_PID" 2>/dev/null; then
              echo "port-forward process died unexpectedly"
              exit 1
            fi
            if curl -sf "$IRIS_CONTROLLER_URL/health" > /dev/null 2>&1; then
              HEALTHY=true
              break
            fi
            sleep 5
          done
          if [ "$HEALTHY" != "true" ]; then
            echo "Controller did not become healthy within timeout"
            exit 1
          fi

          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x

      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          JAX_TRACEBACK_FILTERING: "off"
          MARIN_CI_S3_PREFIX: s3://marin-na/temp/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          # Reuses the port-forward started by the previous step (background
          # processes survive across steps on the same runner).
          IRIS_CONTROLLER_URL="http://localhost:10000"
          timeout 600 uv run tests/integration/iris/run_iris_full_integration.py \
            --controller-url "$IRIS_CONTROLLER_URL"

      - name: Stop port-forward
        if: always()
        run: |
          [ -n "$PF_PID" ] && kill "$PF_PID" 2>/dev/null || true
          pkill -f "kubectl port-forward.*$IRIS_NAMESPACE" 2>/dev/null || true

      - name: Capture failure diagnostics
        if: failure()
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          echo "=== Controller logs ==="
          kubectl -n "$IRIS_NAMESPACE" logs -l app=iris-controller --tail=500 || true
          echo "=== Controller pod describe ==="
          kubectl -n "$IRIS_NAMESPACE" describe pod -l app=iris-controller || true
          echo "=== Worker pods ==="
          kubectl -n "$IRIS_NAMESPACE" get pods -l "$IRIS_MANAGED_LABEL=true" || true
          echo "=== Warning events ==="
          kubectl -n "$IRIS_NAMESPACE" get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true

      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "${{ job.status }}" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api repos/${{ github.repository }}/statuses/"$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"

.github/workflows/iris-integration.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,14 +69,23 @@ jobs:
6969
run: |
7070
uv run pytest tests/integration/iris/ \
7171
--controller-url "$IRIS_CONTROLLER_URL" \
72-
-v --tb=short --timeout=600 \
72+
-v -s --log-cli-level=INFO --tb=short --timeout=600 \
7373
-o "addopts=" \
7474
-x
7575
env:
7676
WANDB_MODE: disabled
7777
WANDB_API_KEY: ""
7878
JAX_TRACEBACK_FILTERING: off
7979

80+
- name: Run full integration pipeline
81+
run: |
82+
timeout 600 uv run tests/integration/iris/run_iris_full_integration.py \
83+
--controller-url "$IRIS_CONTROLLER_URL"
84+
env:
85+
WANDB_MODE: disabled
86+
WANDB_API_KEY: ""
87+
JAX_TRACEBACK_FILTERING: off
88+
8089
- name: Stop cluster
8190
if: always()
8291
run: kill $CLUSTER_PID 2>/dev/null || true
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
# Persistent CoreWeave CI cluster. Both scale groups are pinned at min=max=1
# so nodes stay warm between runs — only controller and worker pods are reset.

# NOTE(review): indentation/nesting reconstructed from a flattened paste —
# confirm against the schema the iris CLI expects.

platform:
  label_prefix: iris-ci
  coreweave:
    region: US-WEST-04A
    namespace: iris-ci
    kubeconfig_path: ~/.kube/coreweave-iris
    object_storage_endpoint: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com

# Controller/cluster state persisted to R2 so restarts can resume.
storage:
  remote_state_dir: s3://marin-na/iris/state/ci

kubernetes_provider:
  namespace: iris-ci
  default_image: ghcr.io/marin-community/iris-task:latest
  host_network: true
  cache_dir: /mnt/local/iris-cache
  # In-cluster address workers use to reach the controller service.
  controller_address: http://iris-ci-controller-svc.iris-ci.svc.cluster.local:10000

controller:
  image: ghcr.io/marin-community/iris-controller:latest
  coreweave:
    port: 10000
    service_name: iris-ci-controller-svc
    # Controller is scheduled onto the CPU scale group below.
    scale_group: cpu-erapids

defaults:
  autoscaler:
    evaluation_interval:
      milliseconds: 10000
    scale_up_delay:
      milliseconds: 60000
    scale_down_delay:
      milliseconds: 300000
    startup_grace_period:
      milliseconds: 1200000  # 20 min — nodes are pinned warm so this rarely fires
  task_env:
    MARIN_PREFIX: s3://marin-na/marin
  worker:
    docker_image: ghcr.io/marin-community/iris-worker:latest
    port: 10001
    cache_dir: /mnt/local/iris-cache
    runtime: kubernetes
    default_task_image: ghcr.io/marin-community/iris-task:latest

scale_groups:
  # CPU pool — also hosts the controller pod.
  cpu-erapids:
    num_vms: 1
    resources:
      cpu: 64
      ram: 256GB
      disk: 1TB
      device_type: cpu
    preemptible: false
    worker:
      attributes:
        region: US-WEST-04A
        pool: cpu-erapids
    # Pinned min=max=1 so the node stays warm between CI runs.
    min_slices: 1
    max_slices: 1
    priority: 50
    slice_template:
      num_vms: 1
      coreweave:
        region: US-WEST-04A
        instance_type: cd-gp-i64-erapids

  # GPU pool — single 8xH100 node, also pinned warm.
  h100-8x:
    num_vms: 1
    resources:
      cpu: 128
      ram: 2048GB
      disk: 1TB
      device_type: gpu
      device_variant: H100
      device_count: 8
    preemptible: false
    worker:
      attributes:
        region: US-WEST-04A
        pool: h100-8x
    min_slices: 1
    max_slices: 1
    priority: 100
    slice_template:
      num_vms: 1
      coreweave:
        region: US-WEST-04A
        instance_type: gd-8xh100ib-i128

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1870,6 +1870,18 @@ def exec_in_container(
18701870

18711871
task_worker_id = task.worker_id
18721872
if not task_worker_id:
1873+
if self._controller.has_direct_provider:
1874+
provider = self._controller.provider
1875+
timeout = request.timeout_seconds if request.timeout_seconds else 60
1876+
resp = provider.exec_in_container(
1877+
task.task_id.to_wire(), task.current_attempt_id, list(request.command), timeout
1878+
)
1879+
return cluster_pb2.Controller.ExecInContainerResponse(
1880+
exit_code=resp.exit_code,
1881+
stdout=resp.stdout,
1882+
stderr=resp.stderr,
1883+
error=resp.error,
1884+
)
18731885
raise ConnectError(Code.FAILED_PRECONDITION, f"Task {request.task_id} not assigned to a worker")
18741886

18751887
worker = _read_worker(self._db, task_worker_id)

lib/iris/src/iris/cluster/providers/k8s/tasks.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,26 @@ def profile_task(
693693
except Exception as e:
694694
return cluster_pb2.ProfileTaskResponse(error=str(e))
695695

696+
    def exec_in_container(
        self,
        task_id: str,
        attempt_id: int,
        command: list[str],
        timeout_seconds: int = 60,
    ) -> cluster_pb2.Worker.ExecInContainerResponse:
        """Execute a command in a running task pod via kubectl exec.

        Args:
            task_id: Wire-format task name; resolved to a pod name together
                with ``attempt_id`` via ``_pod_name``.
            attempt_id: Attempt number embedded in the pod name.
            command: Argv to execute inside the pod's "task" container.
            timeout_seconds: Max seconds to wait for the exec; a negative
                value means no timeout. NOTE(review): 0 is passed through as a
                zero-second timeout, not "no timeout" — confirm intended.

        Returns:
            ExecInContainerResponse carrying exit_code/stdout/stderr on
            success, or with only ``error`` set if the exec itself raised
            (e.g. pod missing or kubectl failure).
        """
        pod_name = _pod_name(JobName.from_wire(task_id), attempt_id)
        effective_timeout: float | None = timeout_seconds if timeout_seconds >= 0 else None
        try:
            result = self.kubectl.exec(pod_name, command, container="task", timeout=effective_timeout)
            return cluster_pb2.Worker.ExecInContainerResponse(
                exit_code=result.returncode,
                stdout=result.stdout,
                stderr=result.stderr,
            )
        except Exception as e:
            # Best-effort: report the failure in-band instead of propagating,
            # so callers always get a response object.
            return cluster_pb2.Worker.ExecInContainerResponse(error=str(e))
715+
696716
def close(self) -> None:
697717
"""No persistent resources to release."""
698718

lib/marin/src/marin/processing/classification/classifier.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ def load_model(self):
8181

8282
with FileLock(lock_file):
8383
if not os.path.exists(success_file):
84-
fs.makedirs(f"/tmp/{model_descriptor}", exist_ok=True)
84+
os.makedirs(f"/tmp/{model_descriptor}", exist_ok=True)
8585

8686
if is_remote_or_local_path:
8787
fs.get(fs_path, local_filepath)

0 commit comments

Comments
 (0)