[iris] k8s: bulk pod-metrics query to cut controller load #2881
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display name of this workflow in the Actions UI.
name: 'Iris - Smoke - CoreWeave'

# Triggers:
#   * pull requests (opened or updated) that touch Iris sources, this workflow
#     file, or the CI monitor script;
#   * manual runs via workflow_dispatch.
on:
  pull_request:
    types:
      - opened
      - synchronize
    paths:
      - 'lib/iris/**'
      - '.github/workflows/iris-smoke-coreweave.yaml'
      - 'scripts/ci/iris_monitor.py'
  workflow_dispatch:
# Scopes granted to the workflow token.
permissions:
  contents: read
  packages: write
  # Lets the job post a commit status when started via workflow_dispatch.
  statuses: write
# This concurrency group is shared with marin-canary-ferry-coreweave.yaml.
# Both workflows rebuild/roll the shared iris-ci controller and submit against
# the shared H100 in US-WEST-04A, so only one run may be active cluster-wide at
# a time. In-progress runs are never cancelled, so a PR firing cannot kill a
# mid-flight canary.
concurrency:
  cancel-in-progress: false
  group: iris-coreweave-ci-shared
jobs:
  # Single job: rebuild and restart the iris-ci controller on CoreWeave, run the
  # Iris integration tests against it, and collect diagnostics on failure.
  cw-ci-test:
    # Run for pull requests whose head branch lives in this repository, and for
    # manual dispatches.
    if: >-
      (github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository) ||
      github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest
    timeout-minutes: 60
    env:
      IRIS_NAMESPACE: iris-ci
      # Must match Labels(label_prefix).iris_managed from the cluster config
      IRIS_MANAGED_LABEL: iris-iris-ci-managed
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      # Commit statuses are only posted by hand for workflow_dispatch runs.
      # Best-effort: a failure to post "pending" must not fail the job.
      - name: Set commit status to pending
        if: github.event_name == 'workflow_dispatch'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          # Use the runner's default environment variables rather than
          # interpolating expressions into the script text.
          gh api "repos/$GITHUB_REPOSITORY/statuses/$sha" \
            -f state=pending \
            -f context="Iris CoreWeave CI" \
            -f target_url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      # The kubeconfig is handed to the script through an environment variable.
      # Pasting the secret into the script text would let the shell interpret
      # any quote, `$` or backtick it contains.
      - name: Write kubeconfig
        env:
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          mkdir -p ~/.kube
          # Create the file owner-only from the start; the chmod below still
          # tightens a pre-existing file.
          (umask 077 && printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris)
          chmod 600 ~/.kube/coreweave-iris

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Delete stale worker pods so the autoscaler recreates them with fresh images.
      # Nodepools (and their underlying nodes) survive — this is the "warm start".
      - name: Reset worker pods
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          kubectl delete pods -n "$IRIS_NAMESPACE" -l "$IRIS_MANAGED_LABEL=true" --grace-period=0 --ignore-not-found || true

      # Rebuild images and (re)start the controller. `cluster start` is fully
      # idempotent on K8s: it applies namespace/RBAC/ConfigMap/Deployment/Service
      # and triggers a rollout restart, so both cold starts and warm restarts
      # work without needing to tunnel to an existing controller first.
      - name: Start controller
        env:
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=config/coreweave-ci.yaml \
            cluster start --fresh

      # NOTE(review): later steps read IRIS_CONTROLLER_URL, PF_PID and PF_LOG,
      # none of which are set in this file; presumably this script exports them
      # for subsequent steps. Confirm against scripts/ci/iris_monitor.py.
      - name: Connect to iris-ci controller
        run: |
          uv run python scripts/ci/iris_monitor.py coreweave-controller \
            --namespace "$IRIS_NAMESPACE" \
            --kubeconfig "$HOME/.kube/coreweave-iris" \
            --log-path "$RUNNER_TEMP/iris-cw-port-forward.log"

      - name: Run integration tests
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted so no YAML 1.1 tool can read it as boolean false.
          JAX_TRACEBACK_FILTERING: "off"
          # When set, the marin-on-iris test uploads fixtures and writes
          # intermediate data to S3 (R2) so remote Zephyr pods can access them.
          MARIN_CI_S3_PREFIX: s3://marin-na/tmp/ttl=3d/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          export KUBECONFIG=~/.kube/coreweave-iris
          uv run pytest tests/integration/iris/ \
            --controller-url "$IRIS_CONTROLLER_URL" \
            -v --tb=short --timeout=600 \
            -o "addopts=" \
            -x

      - name: Run full integration pipeline
        env:
          WANDB_MODE: disabled
          WANDB_API_KEY: ""
          # Quoted so no YAML 1.1 tool can read it as boolean false.
          JAX_TRACEBACK_FILTERING: "off"
          MARIN_CI_S3_PREFIX: s3://marin-na/tmp/ttl=3d/ci
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
          FSSPEC_S3: '{"endpoint_url": "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com"}'
        run: |
          timeout 1800 uv run pytest tests/test_integration_test.py \
            -m integration -o "addopts=" --timeout=900 -v -s

      - name: Capture failure diagnostics
        if: failure()
        continue-on-error: true
        env:
          LOG_DIR: ${{ github.workspace }}/iris-cw-logs
        run: |
          mkdir -p "$LOG_DIR"
          # Fall back to the path given to the connect step's --log-path, so
          # the port-forward log is still captured when PF_LOG was never set.
          pf_log="${PF_LOG:-$RUNNER_TEMP/iris-cw-port-forward.log}"
          if [ -f "$pf_log" ]; then
            cp "$pf_log" "$LOG_DIR/port-forward.log"
          fi
          # No job submitted in this lane, so the iris.job_id selector matches
          # nothing — continue-on-error tolerates the empty kubernetes-pods.json.
          uv run python scripts/ci/iris_monitor.py collect \
            --job-id "ci-smoke" \
            --controller-url "$IRIS_CONTROLLER_URL" \
            --provider coreweave \
            --output-dir "$LOG_DIR" \
            --namespace "$IRIS_NAMESPACE" \
            --managed-label "$IRIS_MANAGED_LABEL" \
            --kubeconfig "$HOME/.kube/coreweave-iris"

      - name: Upload failure diagnostics
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cw-ci-logs
          path: iris-cw-logs/
          retention-days: 14
          if-no-files-found: ignore

      # Always runs. Kills the recorded port-forward PID when one is set, then
      # sweeps for any matching kubectl port-forward process, covering both
      # argument orders.
      - name: Stop port-forward
        if: always()
        run: |
          if [ -n "${PF_PID:-}" ]; then
            kill "$PF_PID" 2>/dev/null || true
          fi
          pkill -f "kubectl.*$IRIS_NAMESPACE.*port-forward.*pod/iris-controller" 2>/dev/null || true
          pkill -f "kubectl.*port-forward.*$IRIS_NAMESPACE.*pod/iris-controller" 2>/dev/null || true

      # Report the final result for workflow_dispatch runs. Any job status other
      # than "success" is reported as failure.
      - name: Set commit status to result
        if: always() && github.event_name == 'workflow_dispatch'
        env:
          GH_TOKEN: ${{ github.token }}
          JOB_STATUS: ${{ job.status }}
        run: |
          sha=$(git rev-parse HEAD)
          if [ "$JOB_STATUS" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api "repos/$GITHUB_REPOSITORY/statuses/$sha" \
            -f state="$state" \
            -f context="Iris CoreWeave CI" \
            -f target_url="$GITHUB_SERVER_URL/$GITHUB_REPOSITORY/actions/runs/$GITHUB_RUN_ID"