Skip to content

Commit 40e50a5

Browse files
authored
Remove Ray from integration tests; consolidate on Iris (#4601)
Replace the Ray-backed marin-itest with the existing Iris-backed pipeline test, fold the iris-integration workflow into marin-itest (which now starts its own local Iris controller), and drop the Ray fixtures and tests from lib/fray/tests. Part of the Ray deprecation effort.
1 parent cda5a74 commit 40e50a5

11 files changed

Lines changed: 211 additions & 1479 deletions

File tree

.github/workflows/fray-unit-tests.yaml

Lines changed: 0 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -49,51 +49,5 @@ jobs:
4949

5050
- name: Test fray
5151
run: |
52-
# NOTE: do not use frozen here because it makes ray sad when it sets up venvs with uv
5352
cd lib/fray && uv run --group=fray-test pytest --durations=5 --tb=short -m 'not slow and not tpu_ci' -v -s tests/
5453
55-
fray-tpu-tests:
56-
needs: changes
57-
runs-on: [tpu-ci]
58-
timeout-minutes: 10
59-
if: needs.changes.outputs.should_run == 'true' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
60-
steps:
61-
- name: Checkout code
62-
uses: actions/checkout@v3
63-
with:
64-
ref: ${{ (github.event_name == 'pull_request_review' && format('refs/pull/{0}/merge', github.event.pull_request.number)) || '' }}
65-
66-
- name: Run TPU tests in Docker
67-
env:
68-
HF_TOKEN: ${{ secrets.HF_TOKEN }}
69-
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
70-
run: |
71-
bash infra/tpu-ci/clean-tpu.sh
72-
73-
DOCKER_IMAGE="ghcr.io/marin-community/marin/tpu-ci:latest"
74-
echo "Using Docker image: $DOCKER_IMAGE"
75-
76-
# Create UV cache directory
77-
mkdir -p /tmp/uv-cache
78-
chmod 777 /tmp/uv-cache
79-
80-
docker run --rm \
81-
--device /dev/vfio:/dev/vfio \
82-
--shm-size=100g \
83-
--stop-timeout=5 \
84-
--cap-add=SYS_RESOURCE \
85-
--ulimit memlock=68719476736:68719476736 \
86-
-e HF_TOKEN \
87-
-e JAX_COORDINATOR_ADDRESS=127.0.0.1 \
88-
-e JAX_PLATFORMS=tpu,cpu \
89-
-e PJRT_DEVICE=TPU \
90-
-e TPU_MIN_LOG_LEVEL=3 \
91-
-e TPU_STDERR_LOG_LEVEL=3 \
92-
-e UV_CACHE_DIR=/tmp/uv-cache \
93-
-e WANDB_API_KEY \
94-
-e WANDB_MODE=offline \
95-
-v ${{ github.workspace }}:/workspace-src:ro \
96-
-v /tmp/uv-cache:/tmp/uv-cache:rw \
97-
-w /workspace \
98-
$DOCKER_IMAGE \
99-
bash -c "cp -a /workspace-src/. /workspace/ && cd /workspace/lib/fray && timeout --kill-after=5 --signal=TERM 590 uv run --group=fray-tpu-test pytest tests/ -v --tb=short -s --log-cli-level=INFO -m tpu_ci"

.github/workflows/iris-coreweave-ci.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ jobs:
174174
FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
175175
run: |
176176
IRIS_CONTROLLER_URL="http://localhost:${LOCAL_PORT}"
177-
timeout 600 uv run tests/integration/iris/run_iris_full_integration.py \
177+
timeout 600 uv run tests/integration_test.py \
178178
--controller-url "$IRIS_CONTROLLER_URL"
179179
180180
- name: Stop port-forward

.github/workflows/iris-integration.yaml

Lines changed: 0 additions & 95 deletions
This file was deleted.

.github/workflows/marin-itest.yaml

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,18 +12,16 @@ jobs:
1212
marin-itest:
1313
if: github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository
1414
runs-on: ubuntu-latest
15-
# uv has to resolve + download large binary wheels (jax/torch). 10 minutes wasn't
16-
# enough for cache misses on GitHub's runners, so give ourselves more runway.
17-
timeout-minutes: 25
15+
timeout-minutes: 45
1816
concurrency:
1917
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
2018
cancel-in-progress: true
2119

2220
steps:
2321
- name: Checkout code
24-
uses: actions/checkout@v3
22+
uses: actions/checkout@v4
2523
- name: Set up Python 3.12
26-
uses: actions/setup-python@v4
24+
uses: actions/setup-python@v5
2725
with:
2826
python-version: "3.12"
2927

@@ -41,21 +39,61 @@ jobs:
4139
# `--no-default-groups` keeps uv from also installing every workspace package's
4240
# dev/docs/test groups (e.g. `levanter[docs,test,dev]`), which saves multiple
4341
# minutes and avoids re-downloading CUDA wheels unnecessarily on cold caches.
44-
run: uv sync --all-packages --extra=cpu --extra=dedup --no-default-groups
42+
run: uv sync --all-packages --extra=cpu --extra=dedup --no-default-groups --group dev
4543

4644
- name: Check df -h
4745
run: df -h
4846

49-
- name: Give Ray tmp space
50-
run: sudo mkdir -p /mnt/ray && sudo chmod 777 /mnt/ray
47+
- name: Start local Iris cluster
48+
run: |
49+
uv run iris --config lib/iris/examples/test.yaml \
50+
cluster start --local > /tmp/iris-cluster.log 2>&1 &
51+
CLUSTER_PID=$!
52+
echo "CLUSTER_PID=$CLUSTER_PID" >> "$GITHUB_ENV"
5153
52-
- name: Run the quickstart script
53-
shell: bash -l {0}
54-
# N.B. You _must not_ use `uv run` here, as that triggers weird behavior from Ray
55-
# https://github.com/ray-project/ray/issues/54344
56-
run: .venv/bin/python tests/integration_test.py
54+
# Wait for controller to print its URL
55+
for i in $(seq 1 120); do
56+
if grep -q "Controller started at" /tmp/iris-cluster.log 2>/dev/null; then
57+
URL=$(grep "Controller started at" /tmp/iris-cluster.log | head -1 | sed -n 's/.*Controller started at //p')
58+
echo "IRIS_CONTROLLER_URL=$URL" >> "$GITHUB_ENV"
59+
echo "Cluster ready at $URL"
60+
break
61+
fi
62+
sleep 1
63+
done
64+
65+
if [ -z "${URL:-}" ]; then
66+
echo "Cluster failed to start within timeout"
67+
cat /tmp/iris-cluster.log
68+
exit 1
69+
fi
70+
71+
- name: Run iris integration tests
72+
run: |
73+
uv run pytest tests/integration/iris/ \
74+
--controller-url "$IRIS_CONTROLLER_URL" \
75+
-v -s --log-cli-level=INFO --tb=short --timeout=600 \
76+
-o "addopts=" \
77+
-x
78+
env:
79+
WANDB_MODE: disabled
80+
WANDB_API_KEY: ""
81+
JAX_TRACEBACK_FILTERING: off
82+
83+
- name: Run full marin integration pipeline
84+
run: |
85+
timeout 600 uv run tests/integration_test.py \
86+
--controller-url "$IRIS_CONTROLLER_URL"
5787
env:
5888
HF_TOKEN: ${{ secrets.HF_TOKEN }}
89+
WANDB_MODE: disabled
90+
WANDB_API_KEY: ""
5991
JAX_TRACEBACK_FILTERING: off
60-
WANDB_MODE: offline
61-
RAY_TMPDIR: /mnt/ray/
92+
93+
- name: Stop cluster
94+
if: always()
95+
run: kill $CLUSTER_PID 2>/dev/null || true
96+
97+
- name: Show cluster logs on failure
98+
if: failure()
99+
run: cat /tmp/iris-cluster.log || true

lib/fray/tests/conftest.py

Lines changed: 3 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4,38 +4,15 @@
44
"""Pytest fixtures for fray tests."""
55

66

7-
import logging
8-
97
import pytest
10-
import ray
118
from fray.v1.cluster.local_cluster import LocalCluster, LocalClusterConfig
129

1310

14-
@pytest.fixture(scope="module")
15-
def ray_cluster():
16-
from fray.v1.cluster.ray import RayCluster
17-
18-
if not ray.is_initialized():
19-
logging.info("Initializing Ray cluster")
20-
ray.init(
21-
address="local",
22-
num_cpus=8,
23-
ignore_reinit_error=True,
24-
logging_level="info",
25-
log_to_driver=True,
26-
resources={"head_node": 1},
27-
)
28-
yield RayCluster()
29-
30-
3111
@pytest.fixture(scope="module")
3212
def local_cluster():
3313
yield LocalCluster(LocalClusterConfig(use_isolated_env=False))
3414

3515

36-
@pytest.fixture(scope="module", params=["local", "ray"])
37-
def cluster(request, local_cluster, ray_cluster):
38-
if request.param == "local":
39-
return local_cluster
40-
elif request.param == "ray":
41-
return ray_cluster
16+
@pytest.fixture(scope="module")
17+
def cluster(local_cluster):
18+
return local_cluster

lib/fray/tests/test_job_context.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,15 @@
44
"""Tests for execution contexts."""
55

66
import pytest
7-
from fray.v1.job import RayContext, SimpleActor, SyncContext, ThreadContext, create_job_ctx
7+
from fray.v1.job import SimpleActor, SyncContext, ThreadContext, create_job_ctx
88
from fray.v1.job.context import _apply_default_jax_platforms
99

1010

11-
@pytest.fixture(params=["sync", "threadpool", "ray"])
12-
def job_context(request, ray_cluster):
11+
@pytest.fixture(params=["sync", "threadpool"])
12+
def job_context(request):
1313
if request.param == "sync":
1414
return SyncContext()
15-
elif request.param == "threadpool":
16-
return ThreadContext(max_workers=2)
17-
18-
return RayContext()
15+
return ThreadContext(max_workers=2)
1916

2017

2118
def test_context_put_get(job_context):

0 commit comments

Comments
 (0)