# NOTE: the text originally at the top of this file was not YAML. It appears to
# be page chrome captured along with the workflow ("Skip to content") followed
# by a repeated pull-request title:
#   [iris] Label always-on CoreWeave nodes as system-critical #142
# Workflow-level settings: display name, triggers, token permissions and the
# concurrency policy shared by every job in this file.
name: Iris - Cloud Smoke Test (GCP)

on:
  # Pull requests, but only those that change something under lib/iris/.
  pull_request:
    paths:
      - "lib/iris/**"
  # Every newly created issue/PR comment starts the workflow.
  issue_comment:
    types:
      - created
  # Manual runs.
  workflow_dispatch:

permissions:
  contents: read
  packages: write
  # Lets runs started from issue_comment post a commit status.
  statuses: write

concurrency:
  # Keyed by PR number, then issue number, then run id, whichever is set first.
  group: iris-cloud-smoke-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
  # A running cloud test is never cancelled: killing it would leak GCP resources.
  cancel-in-progress: false
jobs:
  # Brings up a labelled smoke cluster on GCP, runs the e2e smoke tests against
  # it, then tears the cluster down again and uploads any screenshots.
  cloud-smoke-test:
    # Runs for: PRs touching lib/iris/**, manual dispatch, or a PR comment that
    # contains /iris-smoke and was written by a MEMBER, COLLABORATOR or OWNER.
    if: >-
      github.event_name == 'pull_request' ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-smoke') &&
        (
          github.event.comment.author_association == 'MEMBER' ||
          github.event.comment.author_association == 'COLLABORATOR' ||
          github.event.comment.author_association == 'OWNER'
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 45
    outputs:
      # Label prefix for this run's cloud resources; exposed to dependent jobs.
      label-prefix: ${{ steps.label.outputs.prefix }}
    steps:
      - name: Compute label prefix
        id: label
        env:
          # Values are handed over as environment variables rather than being
          # pasted into the script text.
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number || github.event.issue.number }}
        run: |
          case "$EVENT_NAME" in
            pull_request | issue_comment)
              prefix="smoke-pr-${PR_NUMBER}"
              ;;
            *)
              # workflow_dispatch: use the first 8 characters of the commit SHA
              prefix="smoke-${GITHUB_SHA:0:8}"
              ;;
          esac
          echo "prefix=${prefix}" >> "$GITHUB_OUTPUT"

      # NOTE(review): on the issue_comment path this checks out the PR head and
      # the later steps execute code from it while repository secrets are
      # configured. The only gate is the commenter's author_association in the
      # job-level `if`; confirm that is an acceptable trust boundary.
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # pull_request uses default merge ref; issue_comment needs explicit PR head
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      # Runs after checkout on purpose: an issue_comment payload carries no
      # pull_request.head.sha, and github.sha is not the PR head for that
      # event. Reading the checked-out HEAD targets the same commit that the
      # cleanup job's final status uses (it also reads `git rev-parse HEAD`).
      - name: Set commit status to pending
        if: github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
        run: |
          sha=$(git rev-parse HEAD)
          # Best effort: failing to post the status must not fail the test run.
          gh api "repos/${GITHUB_REPOSITORY}/statuses/${sha}" \
            -f state=pending \
            -f context="Iris Cloud Smoke Test" \
            -f target_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" || true

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Cache Playwright browsers
        id: playwright-cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: playwright-${{ runner.os }}-${{ hashFiles('lib/iris/pyproject.toml') }}

      # Cache miss: install the browser together with its system dependencies.
      - name: Install Playwright browsers
        if: steps.playwright-cache.outputs.cache-hit != 'true'
        run: cd lib/iris && uv run playwright install --with-deps chromium

      # Cache hit: only the system dependencies are installed.
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: cd lib/iris && uv run playwright install-deps chromium

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Write SSH key
        env:
          # Passed via the environment so the key material never becomes part
          # of the script text, where quotes, `$` or backticks would be
          # interpreted by the shell.
          MARIN_SSH_KEY: ${{ secrets.MARIN_SSH_KEY }}
        run: |
          mkdir -p ~/.ssh
          printf '%s\n' "$MARIN_SSH_KEY" > ~/.ssh/marin_ray_cluster.pem
          chmod 600 ~/.ssh/marin_ray_cluster.pem

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start smoke cluster
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
          MARIN_PREFIX: "gs://marin-eu-west4"
        run: |
          # Launched in the background; its PID is exported so the
          # "Stop tunnel" step can kill it later.
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster start-smoke \
            --label-prefix "$LABEL_PREFIX" \
            --url-file /tmp/iris-controller-url \
            --wait-for-workers 1 \
            --worker-timeout 600 &
          START_PID=$!
          echo "START_PID=$START_PID" >> "$GITHUB_ENV"

          # Poll for the URL file: up to 360 attempts, 2 seconds apart.
          for _ in $(seq 1 360); do
            if [ -f /tmp/iris-controller-url ]; then
              echo "Cluster ready!"
              echo "IRIS_CONTROLLER_URL=$(cat /tmp/iris-controller-url)" >> "$GITHUB_ENV"
              break
            fi
            if ! kill -0 "$START_PID" 2>/dev/null; then
              # The launcher is gone. Look once more in case it wrote the file
              # just before exiting; otherwise stop waiting instead of polling
              # until the timeout.
              if [ ! -f /tmp/iris-controller-url ]; then
                echo "start-smoke exited before writing /tmp/iris-controller-url"
                exit 1
              fi
              continue
            fi
            sleep 2
          done

          if [ ! -f /tmp/iris-controller-url ]; then
            echo "Timed out waiting for cluster"
            kill "$START_PID" 2>/dev/null || true
            exit 1
          fi

      - name: Run cloud smoke tests
        env:
          IRIS_SCREENSHOT_DIR: ${{ github.workspace }}/iris-cloud-screenshots
          PYTHONASYNCIODEBUG: "1"
        run: |
          mkdir -p "$IRIS_SCREENSHOT_DIR"
          cd lib/iris && uv run --group dev python -m pytest \
            tests/e2e/test_smoke.py \
            -m e2e \
            --iris-controller-url "$IRIS_CONTROLLER_URL" \
            -o "addopts=" \
            --tb=short -v \
            --timeout=1200

      # The three steps below run regardless of how the earlier steps ended.
      - name: Stop tunnel
        if: always()
        run: |
          kill "$START_PID" 2>/dev/null || true

      - name: Tear down smoke cluster
        if: always()
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      - name: Upload screenshots
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cloud-smoke-screenshots
          path: iris-cloud-screenshots/
          retention-days: 14
          if-no-files-found: ignore
# --- cleanup job (an entry of the `jobs:` mapping) ---
  # Separate job so cleanup runs even if the test job times out or is killed.
  # Skipped when cloud-smoke-test was skipped (e.g. non-matching comment).
  cleanup:
    needs: cloud-smoke-test
    if: ${{ always() && needs.cloud-smoke-test.result != 'skipped' }}
    runs-on: ubuntu-latest
    timeout-minutes: 10
    env:
      # Label prefix published by the test job; identifies this run's resources.
      LABEL_PREFIX: ${{ needs.cloud-smoke-test.outputs.label-prefix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # issue_comment runs check out the PR head; other events use the default ref.
          ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Tear down cluster (iris stop)
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=examples/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      # Second line of defence using gcloud directly. `always()` lets it run
      # even when an earlier step in this job failed.
      - name: Failsafe GCP resource cleanup
        if: always()
        env:
          # Passed via the environment instead of being pasted into the script.
          PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          MANAGED_LABEL="iris-${LABEL_PREFIX}-managed"
          CONTROLLER_LABEL="iris-${LABEL_PREFIX}-controller"

          echo "Cleaning up GCE instances with label ${MANAGED_LABEL}=true OR ${CONTROLLER_LABEL}=true..."
          # A failed listing is reported as a warning rather than hidden, so a
          # broken failsafe is visible in the run; the step stays best-effort.
          if ! instances=$(gcloud compute instances list \
              --project="$PROJECT" \
              --filter="labels.${MANAGED_LABEL}=true OR labels.${CONTROLLER_LABEL}=true" \
              --format="csv[no-heading](name,zone)"); then
            echo "::warning::Could not list GCE instances; smoke-test instances may have been leaked."
            instances=""
          fi
          while IFS=, read -r name zone; do
            [ -z "$name" ] && continue
            echo "Deleting instance $name ($zone)"
            # stdin is redirected so gcloud cannot consume the remaining rows.
            gcloud compute instances delete "$name" \
              --project="$PROJECT" --zone="$zone" --quiet < /dev/null || true
          done <<< "$instances"

          echo "Cleaning up TPU VMs with label ${MANAGED_LABEL}=true..."
          if ! tpu_uris=$(gcloud compute tpus tpu-vm list \
              --project="$PROJECT" --zone=- \
              --filter="labels.${MANAGED_LABEL}=true" \
              --uri); then
            echo "::warning::Could not list TPU VMs; smoke-test TPUs may have been leaked."
            tpu_uris=""
          fi
          while read -r uri; do
            [ -z "$uri" ] && continue
            # Name is the last path segment of the URI; zone is two segments before it.
            tpu_name=$(echo "$uri" | awk -F/ '{print $NF}')
            tpu_zone=$(echo "$uri" | awk -F/ '{print $(NF-2)}')
            echo "Deleting TPU $tpu_name ($tpu_zone)"
            gcloud compute tpus tpu-vm delete "$tpu_name" \
              --project="$PROJECT" --zone="$tpu_zone" --quiet --async < /dev/null || true
          done <<< "$tpu_uris"

      # `always()` ensures the status is still reported when an earlier cleanup
      # step failed; without it the commit would be left without a final result.
      - name: Set commit status to result
        if: always() && github.event_name == 'issue_comment'
        env:
          GH_TOKEN: ${{ github.token }}
          TEST_RESULT: ${{ needs.cloud-smoke-test.result }}
        run: |
          # HEAD is the ref checked out by this job's checkout step.
          sha=$(git rev-parse HEAD)
          # Anything other than a successful test job is reported as failure.
          if [ "$TEST_RESULT" = "success" ]; then
            state=success
          else
            state=failure
          fi
          gh api "repos/${GITHUB_REPOSITORY}/statuses/${sha}" \
            -f state="$state" \
            -f context="Iris Cloud Smoke Test" \
            -f target_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"