[iris] Remove broken test_worker_restart_preserves_task smoke test #2526
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Iris - Cloud Smoke Test (GCP)

# Three ways to start a run: a pull request that touches lib/iris/**,
# a newly created issue/PR comment, or a manual dispatch.
on:
  pull_request:
    paths:
      - 'lib/iris/**'
  issue_comment:
    types:
      - created
  workflow_dispatch: {}

permissions:
  contents: read
  packages: write
  # Needed to post a commit status when triggered via issue_comment.
  statuses: write

# One concurrency group per PR (or per issue number for comment triggers);
# runs with neither fall back to the unique run id.
concurrency:
  group: iris-cloud-smoke-${{ github.event.pull_request.number || github.event.issue.number || github.run_id }}
  # Never kill a running cloud test — would leak GCP resources.
  cancel-in-progress: false
jobs:
  cloud-smoke-test:
    # Runs for: PRs touching lib/iris/**, manual dispatch, or a PR comment
    # containing "/iris-smoke" written by a MEMBER, COLLABORATOR or OWNER.
    # `github.event.issue.pull_request` is only truthy when the commented
    # issue is a pull request.
    if: >-
      github.event_name == 'pull_request' ||
      github.event_name == 'workflow_dispatch' ||
      (
        github.event_name == 'issue_comment' &&
        github.event.issue.pull_request &&
        contains(github.event.comment.body, '/iris-smoke') &&
        contains(
          fromJSON('["MEMBER", "COLLABORATOR", "OWNER"]'),
          github.event.comment.author_association
        )
      )
    runs-on: ubuntu-latest
    timeout-minutes: 45
    env:
      # Service account impersonated by the OS Login / gcloud ssh commands
      # in the steps of this job.
      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
    outputs:
      # Exposed so the cleanup job can target the same labelled resources.
      label-prefix: ${{ steps.label.outputs.prefix }}
    steps:
- name: Compute label prefix
  id: label
  run: |
    # Derive the resource label prefix from the trigger:
    #   pull_request / issue_comment -> smoke-pr-<PR number>
    #   anything else (workflow_dispatch) -> smoke-<first 8 chars of SHA>
    case "${{ github.event_name }}" in
      pull_request)
        prefix="smoke-pr-${{ github.event.pull_request.number }}"
        ;;
      issue_comment)
        prefix="smoke-pr-${{ github.event.issue.number }}"
        ;;
      *)
        # workflow_dispatch: use short SHA
        full_sha="${{ github.sha }}"
        prefix="smoke-${full_sha:0:8}"
        ;;
    esac
    echo "prefix=${prefix}" >> "$GITHUB_OUTPUT"
- name: Checkout code
  uses: actions/checkout@v4
  with:
    # pull_request uses default merge ref; issue_comment needs explicit PR head
    ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}
# Runs after checkout on purpose. This step only fires for issue_comment
# events, whose payload has no `pull_request` object, so
# `github.event.pull_request.head.sha || github.sha` always fell through to
# github.sha (the default-branch commit for this event) and the pending
# status was attached to the wrong commit. Resolving the SHA from the
# checked-out PR head matches how the final status is computed in the
# cleanup job (`git rev-parse HEAD`).
- name: Set commit status to pending
  if: github.event_name == 'issue_comment'
  env:
    GH_TOKEN: ${{ github.token }}
  run: |
    sha=$(git rev-parse HEAD)
    # Best effort: a failure to post the status must not fail the job.
    gh api "repos/${{ github.repository }}/statuses/${sha}" \
      -f state=pending \
      -f context="Iris Cloud Smoke Test" \
      -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" || true
# Toolchain: Python 3.12 plus uv, with uv's cache keyed on the iris
# project's pyproject.toml.
- name: Set up Python 3.12
  uses: actions/setup-python@v5
  with:
    # Quoted so the version stays a string.
    python-version: '3.12'
- name: Install uv
  uses: astral-sh/setup-uv@v7
  with:
    enable-cache: true
    cache-dependency-glob: 'lib/iris/pyproject.toml'
# Only ~/.cache/ms-playwright is cached (keyed on runner OS and the hash
# of lib/iris/pyproject.toml). Exactly one of the two install steps below
# runs, depending on whether that cache was hit.
- name: Cache Playwright browsers
  id: playwright-cache
  uses: actions/cache@v4
  with:
    path: ~/.cache/ms-playwright
    key: playwright-${{ runner.os }}-${{ hashFiles('lib/iris/pyproject.toml') }}
# Cache miss: install Chromium together with its system dependencies.
- name: Install Playwright browsers
  if: steps.playwright-cache.outputs.cache-hit != 'true'
  run: cd lib/iris && uv run playwright install --with-deps chromium
# Cache hit: the browser is restored, so install the system deps only.
- name: Install Playwright system deps
  if: steps.playwright-cache.outputs.cache-hit == 'true'
  run: npx --yes playwright@1.57.0 install-deps chromium
# Authenticate with the CI service-account key, then install gcloud
# pointed at the project taken from repository secrets.
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}
- name: Set up Google Cloud SDK
  uses: google-github-actions/setup-gcloud@v2
  with:
    project_id: ${{ secrets.GCP_PROJECT_ID }}
- name: Set up OS Login SSH key
  run: |
    # Generate a passphrase-less RSA key for this run and register its
    # public half with OS Login while impersonating the controller service
    # account. The key comment carries the run id and attempt; the
    # registration is created with a 2h TTL.
    key_file="$HOME/.ssh/google_compute_engine"
    mkdir -p "$HOME/.ssh"
    ssh-keygen -t rsa -b 4096 -N "" -q \
      -C "gha-${{ github.run_id }}-${{ github.run_attempt }}" \
      -f "$key_file"
    chmod 600 "$key_file"
    gcloud compute os-login ssh-keys add \
      --key-file "${key_file}.pub" \
      --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
      --ttl=2h
# Docker tooling: Buildx, then a ghcr.io login as the triggering actor
# using the workflow's GITHUB_TOKEN.
- name: Set up Docker Buildx
  uses: docker/setup-buildx-action@v3
- name: Log in to GitHub Container Registry
  uses: docker/login-action@v3
  with:
    registry: ghcr.io
    username: ${{ github.actor }}
    password: ${{ secrets.GITHUB_TOKEN }}
- name: Start smoke cluster
  env:
    LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
    MARIN_PREFIX: gs://marin-eu-west4
  run: |
    url_file=/tmp/iris-controller-url

    # start-smoke is launched in the background and left running; its PID
    # is exported so the later "Stop tunnel" step can kill it.
    cd lib/iris && uv run --group dev iris -v \
      --config=examples/smoke-gcp.yaml \
      cluster start-smoke \
      --label-prefix "$LABEL_PREFIX" \
      --url-file "$url_file" \
      --wait-for-workers 1 \
      --worker-timeout 600 &
    START_PID=$!
    echo "START_PID=$START_PID" >> "$GITHUB_ENV"

    # Poll for the URL file (up to 900 x 2s). Stop polling as soon as the
    # background process is gone: without this check a start-smoke crash
    # kept the step waiting for the full 30 minutes. The file is tested
    # first so a process that wrote the URL and then exited still counts
    # as success.
    for _ in $(seq 1 900); do
      if [ -f "$url_file" ]; then
        break
      fi
      if ! kill -0 "$START_PID" 2>/dev/null; then
        break
      fi
      sleep 2
    done

    # Re-check the file here: this also covers the race where the process
    # wrote the URL and exited between the two checks inside the loop.
    if [ ! -f "$url_file" ]; then
      if kill -0 "$START_PID" 2>/dev/null; then
        echo "Timed out waiting for cluster"
        kill "$START_PID" 2>/dev/null || true
      else
        echo "start-smoke exited before writing $url_file"
      fi
      exit 1
    fi

    echo "Cluster ready!"
    echo "IRIS_CONTROLLER_URL=$(cat "$url_file")" >> "$GITHUB_ENV"
- name: Run cloud smoke tests
  env:
    IRIS_SCREENSHOT_DIR: ${{ github.workspace }}/iris-cloud-screenshots
    PYTHONASYNCIODEBUG: "1"
  run: |
    # Create the screenshot directory up front; it is the same path the
    # "Upload screenshots" step publishes.
    mkdir -p "$IRIS_SCREENSHOT_DIR"
    cd lib/iris
    # Run only the e2e-marked tests in test_smoke.py against the cluster
    # started earlier; `-o addopts=` blanks the addopts ini setting.
    uv run --group dev python -m pytest \
      tests/e2e/test_smoke.py \
      -m e2e \
      --iris-controller-url "$IRIS_CONTROLLER_URL" \
      -o "addopts=" \
      --tb=short -v \
      --timeout=1200
- name: Collect controller and worker logs
  if: failure()
  env:
    LOG_DIR: ${{ github.workspace }}/iris-cloud-logs
    LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
    PROJECT: ${{ secrets.GCP_PROJECT_ID }}
  run: |
    mkdir -p "$LOG_DIR"
    MANAGED_LABEL="iris-${LABEL_PREFIX}-managed"
    CONTROLLER_LABEL="iris-${LABEL_PREFIX}-controller"
    # Always try to grab docker/host logs directly from every VM we
    # spun up. This works even when the controller never became
    # reachable (e.g. tunnel never opened, controller crashed during
    # boot), which is exactly the case where these logs matter most.
    gcloud compute instances list \
      --project="$PROJECT" \
      --filter="labels.${MANAGED_LABEL}=true OR labels.${CONTROLLER_LABEL}=true" \
      --format="csv[no-heading](name,zone,labels.list())" 2>/dev/null \
    | while IFS=, read -r name zone labels; do
        [ -z "$name" ] && continue
        # A VM is treated as the controller when its label list mentions
        # the controller label; everything else is a worker.
        role=worker
        case "$labels" in *"$CONTROLLER_LABEL"*) role=controller ;; esac
        echo "Fetching host logs from $role $name ($zone)"
        out="$LOG_DIR/${role}-${name}"
        # stdin is redirected from /dev/null: otherwise ssh inherits the
        # loop's stdin (the instance list pipe) and can swallow the
        # remaining rows, so only the first VM would be visited.
        gcloud compute ssh "$name" \
          --project="$PROJECT" --zone="$zone" \
          --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
          --ssh-key-file ~/.ssh/google_compute_engine \
          --quiet \
          --command '
            set +e
            echo "=== docker ps -a ==="
            sudo docker ps -a
            for cid in $(sudo docker ps -aq); do
              echo "=== docker logs $cid ==="
              sudo docker logs --timestamps --tail 5000 "$cid" 2>&1
            done
            echo "=== startup script ==="
            sudo journalctl -u google-startup-scripts.service --no-pager 2>&1 | tail -n 2000
            echo "=== kernel/cloud-init ==="
            sudo journalctl -u cloud-final.service --no-pager 2>&1 | tail -n 500
          ' < /dev/null > "${out}.log" 2>&1 || echo "ssh to $name failed (see ${out}.log)"
      done
    # Everything below talks to the controller over RPC, so it needs the
    # URL exported by the "Start smoke cluster" step.
    if [ -z "$IRIS_CONTROLLER_URL" ]; then
      echo "No controller URL, skipping RPC-based log collection"
      exit 0
    fi
    cd lib/iris
    # Cluster status (controller process info + worker summary)
    uv run --group dev iris \
      --config=examples/smoke-gcp.yaml \
      cluster status \
      --url "$IRIS_CONTROLLER_URL" \
      > "$LOG_DIR/cluster-status.txt" 2>&1 || true
    # Fetch recent controller and worker logs via RPC.
    # The shell expands $IRIS_CONTROLLER_URL and $LOG_DIR inside the
    # double-quoted Python source before Python sees it.
    uv run --group dev python3 -c "
    import json, sys
    from iris.rpc import controller_connect
    from iris.rpc import controller_pb2, job_pb2
    url = '$IRIS_CONTROLLER_URL'
    client = controller_connect.ControllerServiceClientSync(url)
    # Controller process status with recent logs
    proc = client.get_process_status(job_pb2.GetProcessStatusRequest())
    with open('$LOG_DIR/controller-process.json', 'w') as f:
        json.dump({
            'git_hash': proc.process_info.git_hash,
            'pid': proc.process_info.pid,
            'memory_mb': proc.process_info.memory_mb,
            'recent_logs': [l.message for l in proc.process_info.recent_logs],
        }, f, indent=2)
    # List workers and fetch status for each
    workers = client.list_workers(controller_pb2.Controller.ListWorkersRequest()).workers
    for w in workers:
        wid = w.worker_id
        try:
            status = client.get_worker_status(
                controller_pb2.Controller.GetWorkerStatusRequest(worker_id=wid)
            )
            with open(f'$LOG_DIR/worker-{wid}.json', 'w') as f:
                json.dump({
                    'worker_id': wid,
                    'healthy': w.healthy,
                    'address': w.address,
                    'recent_logs': [l.message for l in (status.process_info.recent_logs if status.HasField('process_info') else [])],
                }, f, indent=2)
        except Exception as e:
            with open(f'$LOG_DIR/worker-{wid}.txt', 'w') as f:
                f.write(f'Failed to fetch status: {e}\n')
    " 2>&1 || true
# Publish whatever the log-collection step gathered; only on failure, and
# an empty or missing directory is not an error.
- name: Upload logs
  if: failure()
  uses: actions/upload-artifact@v4
  with:
    name: iris-cloud-smoke-logs
    path: iris-cloud-logs/
    if-no-files-found: ignore
    retention-days: 14
# In-job teardown. Each step runs regardless of earlier failures and is
# best effort (`|| true`), so one failing cleanup never blocks the next.
- name: Stop tunnel
  if: always()
  # START_PID is exported to GITHUB_ENV by the "Start smoke cluster" step.
  run: |
    kill $START_PID 2>/dev/null || true
- name: Tear down smoke cluster
  if: always()
  env:
    LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
  run: |
    cd lib/iris && uv run --group dev iris -v \
      --config=examples/smoke-gcp.yaml \
      cluster stop --label "$LABEL_PREFIX" || true
- name: Remove OS Login SSH key
  if: always()
  # Removes the key registered by "Set up OS Login SSH key".
  run: |
    gcloud compute os-login ssh-keys remove \
      --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
      --key-file ~/.ssh/google_compute_engine.pub || true
# Screenshots are uploaded on every outcome; a missing directory is fine.
- name: Upload screenshots
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: iris-cloud-smoke-screenshots
    path: iris-cloud-screenshots/
    if-no-files-found: ignore
    retention-days: 14
# Cleanup lives in its own job so that it still runs when the test job
# times out or is killed. It is skipped only when cloud-smoke-test itself
# was skipped (e.g. a comment that did not match the trigger).
cleanup:
  needs:
    - cloud-smoke-test
  if: always() && needs.cloud-smoke-test.result != 'skipped'
  runs-on: ubuntu-latest
  timeout-minutes: 10
  env:
    # Same prefix the test job used to label its resources.
    LABEL_PREFIX: ${{ needs.cloud-smoke-test.outputs.label-prefix }}
  steps:
# Same checkout, toolchain and GCP setup as the test job, minus the
# browser, Docker and SSH pieces that cleanup does not use.
- name: Checkout code
  uses: actions/checkout@v4
  with:
    # For issue_comment runs, check out the PR head explicitly; otherwise
    # leave the ref empty so the action uses its default.
    ref: ${{ github.event_name == 'issue_comment' && format('refs/pull/{0}/head', github.event.issue.number) || '' }}
- name: Set up Python 3.12
  uses: actions/setup-python@v5
  with:
    python-version: '3.12'
- name: Install uv
  uses: astral-sh/setup-uv@v7
  with:
    enable-cache: true
    cache-dependency-glob: 'lib/iris/pyproject.toml'
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v2
  with:
    credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}
- name: Set up Google Cloud SDK
  uses: google-github-actions/setup-gcloud@v2
  with:
    project_id: ${{ secrets.GCP_PROJECT_ID }}
# First attempt: let iris stop the cluster itself. Failure is tolerated
# because the failsafe step that follows deletes resources by label.
- name: Tear down cluster (iris stop)
  run: |
    cd lib/iris
    uv run --group dev iris -v \
      --config=examples/smoke-gcp.yaml \
      cluster stop --label "$LABEL_PREFIX" || true
- name: Failsafe GCP resource cleanup
  run: |
    # Delete by label anything `iris cluster stop` left behind: GCE
    # instances carrying the managed or controller label, then TPU VMs
    # carrying the managed label. Every delete is best effort.
    PROJECT="${{ secrets.GCP_PROJECT_ID }}"
    managed="iris-${LABEL_PREFIX}-managed"
    controller="iris-${LABEL_PREFIX}-controller"

    echo "Cleaning up GCE instances with label ${managed}=true OR ${controller}=true..."
    gcloud compute instances list \
      --project="$PROJECT" \
      --filter="labels.${managed}=true OR labels.${controller}=true" \
      --format="csv[no-heading](name,zone)" 2>/dev/null \
    | while IFS=, read -r vm_name vm_zone; do
        if [ -z "$vm_name" ]; then
          continue
        fi
        echo "Deleting instance $vm_name ($vm_zone)"
        gcloud compute instances delete "$vm_name" \
          --project="$PROJECT" --zone="$vm_zone" --quiet || true
      done

    echo "Cleaning up TPU VMs with label ${managed}=true..."
    gcloud compute tpus tpu-vm list \
      --project="$PROJECT" --zone=- \
      --filter="labels.${managed}=true" \
      --uri 2>/dev/null \
    | while read -r uri; do
        if [ -z "$uri" ]; then
          continue
        fi
        # Name is the last path segment of the URI; zone is the segment
        # two places before it.
        tpu_name="${uri##*/}"
        zone_path="${uri%/*/*}"
        tpu_zone="${zone_path##*/}"
        echo "Deleting TPU $tpu_name ($tpu_zone)"
        gcloud compute tpus tpu-vm delete "$tpu_name" \
          --project="$PROJECT" --zone="$tpu_zone" --quiet --async || true
      done
- name: Set commit status to result
  if: github.event_name == 'issue_comment'
  env:
    GH_TOKEN: ${{ github.token }}
  run: |
    # Report on the commit checked out by this job. Any test-job result
    # other than "success" is reported as a failure.
    case "${{ needs.cloud-smoke-test.result }}" in
      success) state=success ;;
      *) state=failure ;;
    esac
    sha=$(git rev-parse HEAD)
    gh api repos/${{ github.repository }}/statuses/"$sha" \
      -f state="$state" \
      -f context="Iris Cloud Smoke Test" \
      -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"