[iris] Label always-on CoreWeave nodes as system-critical #142
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow identity, triggers, token permissions and concurrency policy for
# the Iris cloud smoke test.
name: Iris - Cloud Smoke Test (GCP)

# Triggers: pull requests that touch lib/iris/**, newly created issue/PR
# comments (filtered further by the job-level condition), and manual dispatch.
on:
  pull_request:
    paths:
      - "lib/iris/**"
  issue_comment:
    types:
      - created
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  statuses: write  # post commit status from issue_comment trigger

# One group per PR (or per run when there is no PR/issue number), so runs for
# the same PR queue up behind each other instead of overlapping.
concurrency:
  group: iris-cloud-smoke-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
  # Never kill a running cloud test — would leak GCP resources.
  cancel-in-progress: false
jobs:
  # Starts a smoke cluster with a per-run label prefix, runs the e2e smoke
  # tests against it, then stops the cluster and uploads screenshots.
  cloud-smoke-test:
    # Run on: PRs touching lib/iris/**, /iris-smoke comment, or manual dispatch.
    # Comment triggers are restricted to MEMBER / COLLABORATOR / OWNER authors.
    if: >-
      github.event_name == 'pull_request' ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-smoke') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 45
    outputs:
      # Consumed by the cleanup job to find the resources this run created.
      label-prefix: ${{ steps.label.outputs.prefix }}
    steps:
      - name: Compute label prefix
        id: label
        run: |
          if [ "${{ github.event_name }}" = "pull_request" ]; then
            echo "prefix=smoke-pr-${{ github.event.pull_request.number }}" >> "$GITHUB_OUTPUT"
          elif [ "${{ github.event_name }}" = "issue_comment" ]; then
            echo "prefix=smoke-pr-${{ github.event.issue.number }}" >> "$GITHUB_OUTPUT"
          else
            # workflow_dispatch: use short SHA
            echo "prefix=smoke-$(echo ${{ github.sha }} | head -c 8)" >> "$GITHUB_OUTPUT"
          fi

      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # pull_request uses default merge ref; issue_comment needs explicit PR head
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      # Must run after checkout: an issue_comment event carries no pull_request
      # payload and its default SHA is not the PR head, so the commit to mark is
      # read from the checked-out PR head instead. The cleanup job resolves the
      # SHA the same way, so both statuses land on the same commit.
      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          # Best-effort: never fail the run because the status could not be posted.
          sha=$(git rev-parse HEAD) || exit 0
          gh api "repos/${{ github.repository }}/statuses/${sha}" \
            -f state=pending \
            -f context="Iris Cloud Smoke Test" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Cache Playwright browsers
        id: playwright-cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: playwright-${{ runner.os }}-${{ hashFiles('lib/iris/pyproject.toml') }}

      # Cache miss: install the browser together with its system dependencies.
      - name: Install Playwright browsers
        if: steps.playwright-cache.outputs.cache-hit != 'true'
        run: cd lib/iris && uv run playwright install --with-deps chromium

      # Cache hit: the browser is restored, only system dependencies are installed.
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: cd lib/iris && uv run playwright install-deps chromium

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Write SSH key
        env:
          # Passed through the environment so the key is never spliced into the
          # script text, where quotes or dollar signs in it would be re-parsed.
          MARIN_SSH_KEY: ${{ secrets.MARIN_SSH_KEY }}
        run: |
          # Create the directory and file private from the start.
          umask 077
          mkdir -p ~/.ssh
          printf '%s\n' "$MARIN_SSH_KEY" > ~/.ssh/marin_ray_cluster.pem
          chmod 600 ~/.ssh/marin_ray_cluster.pem

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start smoke cluster
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
          MARIN_PREFIX: gs://marin-eu-west4
          URL_FILE: /tmp/iris-controller-url
        run: |
          # cd first so that only the command itself is backgrounded and $! is
          # the PID of the uv process rather than of a shell running a list.
          cd lib/iris
          uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster start-smoke \
            --label-prefix "$LABEL_PREFIX" \
            --url-file "$URL_FILE" \
            --wait-for-workers 1 \
            --worker-timeout 600 &
          START_PID=$!
          # Exported so the "Stop tunnel" step can terminate the process later.
          echo "START_PID=$START_PID" >> "$GITHUB_ENV"

          # Poll for up to 360 x 2s = 12 minutes, but stop early if the
          # background process has already exited.
          for _ in $(seq 1 360); do
            if [ -f "$URL_FILE" ]; then
              break
            fi
            if ! kill -0 "$START_PID" 2>/dev/null; then
              break
            fi
            sleep 2
          done

          # Decide after the loop so a URL file written just before the process
          # exited is still honoured.
          if [ ! -f "$URL_FILE" ]; then
            if kill -0 "$START_PID" 2>/dev/null; then
              echo "Timed out waiting for cluster"
            else
              echo "start-smoke exited before writing the controller URL"
            fi
            kill $START_PID 2>/dev/null || true
            exit 1
          fi
          echo "Cluster ready!"
          echo "IRIS_CONTROLLER_URL=$(cat "$URL_FILE")" >> "$GITHUB_ENV"

      - name: Run cloud smoke tests
        env:
          IRIS_SCREENSHOT_DIR: ${{ github.workspace }}/iris-cloud-screenshots
          PYTHONASYNCIODEBUG: "1"
        run: |
          mkdir -p "$IRIS_SCREENSHOT_DIR"
          cd lib/iris && uv run --group dev python -m pytest \
            tests/e2e/test_smoke.py \
            -m e2e \
            --iris-controller-url "$IRIS_CONTROLLER_URL" \
            -o "addopts=" \
            --tb=short -v \
            --timeout=1200

      # Terminates the background start-smoke process recorded in START_PID.
      - name: Stop tunnel
        if: always()
        run: |
          kill $START_PID 2>/dev/null || true

      - name: Tear down smoke cluster
        if: always()
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      - name: Upload screenshots
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cloud-smoke-screenshots
          path: iris-cloud-screenshots/
          retention-days: 14
          if-no-files-found: ignore

  # Separate job so cleanup runs even if the test job times out or is killed.
  # Skipped when cloud-smoke-test was skipped (e.g. non-matching comment).
  cleanup:
    needs: cloud-smoke-test
    if: ${{ always() && needs.cloud-smoke-test.result != 'skipped' }}
    runs-on: ubuntu-latest
    timeout-minutes: 10
    env:
      LABEL_PREFIX: ${{ needs.cloud-smoke-test.outputs.label-prefix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # issue_comment needs the explicit PR head; other events use the default ref.
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      # First attempt: let iris stop the cluster by label. Best-effort; the
      # failsafe step below still runs if this fails.
      - name: Tear down cluster (iris stop)
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      # Second attempt: delete any GCE instances and TPU VMs that still carry
      # this run's labels, directly through gcloud.
      - name: Failsafe GCP resource cleanup
        env:
          # Passed through the environment instead of being spliced into the script.
          PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          MANAGED_LABEL="iris-${LABEL_PREFIX}-managed"
          CONTROLLER_LABEL="iris-${LABEL_PREFIX}-controller"

          echo "Cleaning up GCE instances with label ${MANAGED_LABEL}=true OR ${CONTROLLER_LABEL}=true..."
          gcloud compute instances list \
            --project="$PROJECT" \
            --filter="labels.${MANAGED_LABEL}=true OR labels.${CONTROLLER_LABEL}=true" \
            --format="csv[no-heading](name,zone)" 2>/dev/null \
          | while IFS=, read -r name zone; do
            [ -z "$name" ] && continue
            echo "Deleting instance $name ($zone)"
            gcloud compute instances delete "$name" \
              --project="$PROJECT" --zone="$zone" --quiet || true
          done

          echo "Cleaning up TPU VMs with label ${MANAGED_LABEL}=true..."
          gcloud compute tpus tpu-vm list \
            --project="$PROJECT" --zone=- \
            --filter="labels.${MANAGED_LABEL}=true" \
            --uri 2>/dev/null \
          | while read -r uri; do
            [ -z "$uri" ] && continue
            # The name is the last URI path segment; the zone is two segments before it.
            tpu_name=$(echo "$uri" | awk -F/ '{print $NF}')
            tpu_zone=$(echo "$uri" | awk -F/ '{print $(NF-2)}')
            echo "Deleting TPU $tpu_name ($tpu_zone)"
            gcloud compute tpus tpu-vm delete "$tpu_name" \
              --project="$PROJECT" --zone="$tpu_zone" --quiet --async || true
          done

      # always(): report the test result even when an earlier cleanup step
      # failed, so the commit is not left with a stale "pending" status.
      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
          TEST_RESULT: ${{ needs.cloud-smoke-test.result }}
        run: |
          # HEAD is the PR head checked out by the first step of this job.
          sha=$(git rev-parse HEAD)
          if [ "$TEST_RESULT" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api "repos/${{ github.repository }}/statuses/${sha}" \
            -f state="$state" \
            -f context="Iris Cloud Smoke Test" \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"