[zephyr] Add finelog stats for stages and workers #2597
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Iris cloud smoke test on GCP.
#
# Runs for pull requests that touch lib/iris/** and on manual dispatch.
# The `cloud-smoke-test` job starts a labelled smoke cluster, runs the
# e2e smoke tests against it, and tears it down; the separate `cleanup`
# job repeats the teardown and then deletes any labelled GCE / TPU
# resources that are still listed.
name: "Iris - Smoke - GCP"

on:
  pull_request:
    paths:
      - "lib/iris/**"
  workflow_dispatch:

permissions:
  contents: read
  packages: write

concurrency:
  # One group per PR (or per run for manual dispatch).
  group: iris-cloud-smoke-${{ github.event.pull_request.number || github.run_id }}
  # Never kill a running cloud test — would leak GCP resources.
  cancel-in-progress: false

jobs:
  cloud-smoke-test:
    # Run on: PRs touching lib/iris/**, or manual dispatch
    runs-on: ubuntu-latest
    timeout-minutes: 45
    env:
      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
    outputs:
      # Consumed by the `cleanup` job to find this run's resources.
      label-prefix: ${{ steps.label.outputs.prefix }}
    steps:
      - name: Compute label prefix
        id: label
        # Context values are passed through the environment instead of being
        # interpolated into the script text.
        env:
          EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          if [ "$EVENT_NAME" = "pull_request" ]; then
            echo "prefix=smoke-pr-${PR_NUMBER}" >> "$GITHUB_OUTPUT"
          else
            # workflow_dispatch: use short SHA (first 8 characters)
            echo "prefix=smoke-${GITHUB_SHA:0:8}" >> "$GITHUB_OUTPUT"
          fi

      - name: Checkout code
        uses: actions/checkout@v5

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Cache Playwright browsers
        id: playwright-cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/ms-playwright
          key: playwright-${{ runner.os }}-${{ hashFiles('lib/iris/pyproject.toml') }}

      # Cache miss: install the browser together with its system dependencies.
      - name: Install Playwright browsers
        if: steps.playwright-cache.outputs.cache-hit != 'true'
        run: cd lib/iris && uv run playwright install --with-deps chromium

      # Cache hit: only the system dependencies are installed.
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: npx --yes playwright@1.57.0 install-deps chromium

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      # Generates a per-run SSH key and registers it with OS Login for the
      # controller service account; the key is removed again in
      # "Remove OS Login SSH key" and carries a 2h TTL.
      - name: Set up OS Login SSH key
        run: |
          mkdir -p ~/.ssh
          ssh-keygen -t rsa -b 4096 -f ~/.ssh/google_compute_engine -N "" -q -C "gha-${{ github.run_id }}-${{ github.run_attempt }}"
          chmod 600 ~/.ssh/google_compute_engine
          gcloud compute os-login ssh-keys add \
            --key-file ~/.ssh/google_compute_engine.pub \
            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
            --ttl=2h

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v4

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v4
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start smoke cluster
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
          MARIN_PREFIX: gs://marin-eu-west4
        run: |
          # start-smoke is left running in the background; its PID is exported
          # so the "Stop tunnel" step can kill it later.
          cd lib/iris && uv run --group dev iris -v \
            --config=config/smoke-gcp.yaml \
            cluster start-smoke \
            --label-prefix "$LABEL_PREFIX" \
            --url-file /tmp/iris-controller-url \
            --wait-for-workers 1 \
            --worker-timeout 600 &
          START_PID=$!
          echo "START_PID=$START_PID" >> "$GITHUB_ENV"
          # Poll for the URL file: 900 iterations x 2s = up to 30 minutes.
          for i in $(seq 1 900); do
            if [ -f /tmp/iris-controller-url ]; then
              echo "Cluster ready!"
              echo "IRIS_CONTROLLER_URL=$(cat /tmp/iris-controller-url)" >> "$GITHUB_ENV"
              break
            fi
            # Fail fast when start-smoke has already exited without writing
            # the URL file, instead of polling until the timeout. The file is
            # re-checked after the liveness probe so a process that wrote the
            # file and then exited is not reported as a failure.
            if ! kill -0 "$START_PID" 2>/dev/null && [ ! -f /tmp/iris-controller-url ]; then
              echo "start-smoke exited before the cluster became ready"
              exit 1
            fi
            sleep 2
          done
          if [ ! -f /tmp/iris-controller-url ]; then
            echo "Timed out waiting for cluster"
            kill $START_PID 2>/dev/null || true
            exit 1
          fi

      - name: Run cloud smoke tests
        env:
          IRIS_SCREENSHOT_DIR: ${{ github.workspace }}/iris-cloud-screenshots
          PYTHONASYNCIODEBUG: "1"
        run: |
          mkdir -p "$IRIS_SCREENSHOT_DIR"
          cd lib/iris && uv run --group dev python -m pytest \
            tests/e2e/test_smoke.py \
            -m requires_cluster \
            --iris-controller-url "$IRIS_CONTROLLER_URL" \
            -o "addopts=" \
            --tb=short -v \
            --timeout=1200

      # Best effort: a failure while collecting logs must not mask the
      # original failure or block the teardown steps below.
      - name: Collect controller and worker logs
        if: failure()
        continue-on-error: true
        env:
          LOG_DIR: ${{ github.workspace }}/iris-cloud-logs
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
          PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          ARGS=(
            collect
            --job-id "smoke-${LABEL_PREFIX}"
            --provider gcp
            --output-dir "$LOG_DIR"
            --project "$PROJECT"
            --controller-label "iris-${LABEL_PREFIX}-controller"
            --managed-label "iris-${LABEL_PREFIX}-managed"
            --service-account "$IRIS_CONTROLLER_SERVICE_ACCOUNT"
            --ssh-key ~/.ssh/google_compute_engine
          )
          # The controller URL is only set if "Start smoke cluster" got far
          # enough to publish it.
          if [ -n "${IRIS_CONTROLLER_URL:-}" ]; then
            ARGS+=(--controller-url "$IRIS_CONTROLLER_URL")
          fi
          uv run python scripts/workflows/iris_monitor.py "${ARGS[@]}"

      - name: Upload logs
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cloud-smoke-logs
          path: iris-cloud-logs/
          retention-days: 14
          if-no-files-found: ignore

      - name: Stop tunnel
        if: always()
        run: |
          # START_PID is only exported once "Start smoke cluster" has run.
          if [ -n "${START_PID:-}" ]; then
            kill "$START_PID" 2>/dev/null || true
          fi

      - name: Tear down smoke cluster
        if: always()
        env:
          LABEL_PREFIX: ${{ steps.label.outputs.prefix }}
        run: |
          cd lib/iris && uv run --group dev iris -v \
            --config=config/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      - name: Remove OS Login SSH key
        if: always()
        run: |
          gcloud compute os-login ssh-keys remove \
            --impersonate-service-account="$IRIS_CONTROLLER_SERVICE_ACCOUNT" \
            --key-file ~/.ssh/google_compute_engine.pub || true

      - name: Upload screenshots
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: iris-cloud-smoke-screenshots
          path: iris-cloud-screenshots/
          retention-days: 14
          if-no-files-found: ignore

  # Separate job so cleanup runs even if the test job times out or is killed.
  # Skipped only when cloud-smoke-test itself was skipped.
  cleanup:
    needs: cloud-smoke-test
    if: ${{ always() && needs.cloud-smoke-test.result != 'skipped' }}
    runs-on: ubuntu-latest
    timeout-minutes: 10
    env:
      LABEL_PREFIX: ${{ needs.cloud-smoke-test.outputs.label-prefix }}
    steps:
      - name: Checkout code
        uses: actions/checkout@v5

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
          cache-dependency-glob: "lib/iris/pyproject.toml"

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Tear down cluster (iris stop)
        run: |
          # The prefix is empty if the test job never produced its output;
          # without it there is no label identifying this run's cluster.
          if [ -z "$LABEL_PREFIX" ]; then
            echo "No label prefix from cloud-smoke-test; nothing to stop."
            exit 0
          fi
          cd lib/iris && uv run --group dev iris -v \
            --config=config/smoke-gcp.yaml \
            cluster stop --label "$LABEL_PREFIX" || true

      - name: Failsafe GCP resource cleanup
        env:
          # Passed through the environment, matching the log-collection step.
          PROJECT: ${{ secrets.GCP_PROJECT_ID }}
        run: |
          if [ -z "$LABEL_PREFIX" ]; then
            echo "No label prefix from cloud-smoke-test; skipping failsafe cleanup."
            exit 0
          fi
          MANAGED_LABEL="iris-${LABEL_PREFIX}-managed"
          CONTROLLER_LABEL="iris-${LABEL_PREFIX}-controller"
          echo "Cleaning up GCE instances with label ${MANAGED_LABEL}=true OR ${CONTROLLER_LABEL}=true..."
          # Each CSV row is "name,zone"; deletions are best effort.
          gcloud compute instances list \
            --project="$PROJECT" \
            --filter="labels.${MANAGED_LABEL}=true OR labels.${CONTROLLER_LABEL}=true" \
            --format="csv[no-heading](name,zone)" 2>/dev/null \
          | while IFS=, read -r name zone; do
            [ -z "$name" ] && continue
            echo "Deleting instance $name ($zone)"
            gcloud compute instances delete "$name" \
              --project="$PROJECT" --zone="$zone" --quiet || true
          done
          echo "Cleaning up TPU VMs with label ${MANAGED_LABEL}=true..."
          # The TPU name is the last path segment of the URI and the zone is
          # the third from the end.
          gcloud compute tpus tpu-vm list \
            --project="$PROJECT" --zone=- \
            --filter="labels.${MANAGED_LABEL}=true" \
            --uri 2>/dev/null \
          | while read -r uri; do
            [ -z "$uri" ] && continue
            tpu_name=$(echo "$uri" | awk -F/ '{print $NF}')
            tpu_zone=$(echo "$uri" | awk -F/ '{print $(NF-2)}')
            echo "Deleting TPU $tpu_name ($tpu_zone)"
            gcloud compute tpus tpu-vm delete "$tpu_name" \
              --project="$PROJECT" --zone="$tpu_zone" --quiet --async || true
          done