Tolerate controller unavailability for up to 1h in job monitoring #318
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Integration tests for Iris: starts a local Iris cluster on the runner and
# runs the pytest suite in tests/integration/iris/ against its controller.
name: Iris - Integration Tests

on:
  push:
    branches: [main]
  pull_request:
  workflow_dispatch:

jobs:
  iris-itest:
    # Skip only pull requests whose head branch lives in another repository.
    # The check is `!= 'pull_request'` rather than `== 'push'` so that manual
    # `workflow_dispatch` runs are admitted: their event payload carries no
    # `pull_request` object, so the repository comparison alone is false.
    if: >-
      github.event_name != 'pull_request' ||
      github.event.pull_request.head.repo.full_name == github.repository
    runs-on: ubuntu-latest
    timeout-minutes: 45
    # One active run per pull request (or per ref for non-PR events); a newer
    # run cancels the one still in progress.
    concurrency:
      group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
      cancel-in-progress: true
    strategy:
      matrix:
        # Quoted so the version stays a string and is never read as a float.
        python-version: ["3.12"]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --extra=dedup --no-default-groups --group dev

      - name: Start local Iris cluster
        run: |
          # Launch the cluster in the background; all of its output goes to
          # the log file that the loop below polls.
          uv run iris --config lib/iris/examples/test.yaml \
            cluster start --local > /tmp/iris-cluster.log 2>&1 &
          CLUSTER_PID=$!
          # Export the PID so the "Stop cluster" step can kill the process.
          echo "CLUSTER_PID=$CLUSTER_PID" >> "$GITHUB_ENV"

          # Wait up to ~120 s for the controller to print its URL.
          URL=""
          for _ in $(seq 1 120); do
            if grep -q "Controller started at" /tmp/iris-cluster.log 2>/dev/null; then
              # Everything after the marker on the first matching line is the URL.
              URL=$(sed -n 's/.*Controller started at //p' /tmp/iris-cluster.log | head -n 1)
              echo "IRIS_CONTROLLER_URL=$URL" >> "$GITHUB_ENV"
              echo "Cluster ready at $URL"
              break
            fi
            # Fail fast instead of polling for the full timeout when the
            # background process has already exited.
            if ! kill -0 "$CLUSTER_PID" 2>/dev/null; then
              echo "Cluster process exited before reporting a controller URL"
              break
            fi
            sleep 1
          done

          if [ -z "${URL:-}" ]; then
            echo "Cluster failed to start within timeout"
            cat /tmp/iris-cluster.log
            exit 1
          fi

      - name: Run integration tests
        # `-o "addopts="` overrides the `addopts` ini option with an empty
        # value; `-x` stops the run at the first failing test.
        run: |
          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted: a bare `off` is a boolean under YAML 1.1 rules, and this
          # variable must reach the process as the literal string "off".
          JAX_TRACEBACK_FILTERING: "off"

      - name: Stop cluster
        # Runs even when earlier steps fail; errors from kill are ignored.
        if: always()
        run: kill "$CLUSTER_PID" 2>/dev/null || true

      - name: Show cluster logs on failure
        if: failure()
        run: cat /tmp/iris-cluster.log || true