# Marin - Datakit Smoke #12
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
name: Marin - Datakit Smoke

on:
  # Allow the workflow to be started manually.
  workflow_dispatch:
  # Daily at 06:30 UTC, offset from canary.
  schedule:
    - cron: "30 6 * * *"

permissions:
  contents: read
  # claude triage files issues
  issues: write
  # claude-code-action OIDC
  id-token: write
jobs:
  # Submits the datakit ferry as an Iris job, polls it to completion, reads the
  # status file the ferry wrote, and validates the outputs. On failure or
  # cancellation it dumps diagnostics, runs Claude triage (scheduled runs
  # only), and uploads any Slack message for the notify-slack job.
  datakit-smoke:
    runs-on: ubuntu-latest
    timeout-minutes: 120
    concurrency:
      group: datakit-smoke
      cancel-in-progress: true
    env:
      SMOKE_RUN_ID: datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}
      FERRY_STATUS_PATH: gs://marin-tmp-us-central1/ttl=1d/ci/datakit-smoke-${{ github.run_id }}-${{ github.run_attempt }}/ferry_run_status.json
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
      IRIS_CONFIG: lib/iris/examples/marin.yaml
      IRIS_CONTROLLER_SERVICE_ACCOUNT: iris-controller@hai-gcp-models.iam.gserviceaccount.com
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.IRIS_CI_GCP_SA_KEY }}

      - name: Set up Google Cloud SDK
        uses: google-github-actions/setup-gcloud@v2
        with:
          project_id: ${{ secrets.GCP_PROJECT_ID }}

      - name: Submit datakit smoke ferry
        id: submit
        shell: bash -l {0}
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # `bash -l {0}` is a custom shell template, so GitHub adds no
          # fail-fast flags; without this a failed submit would still write an
          # empty job_id and let the step pass.
          set -eo pipefail
          JOB_ID=$(.venv/bin/iris --config="$IRIS_CONFIG" \
            job run --no-wait \
            --memory=2G --disk=4G --cpu=1 --extra=cpu \
            --priority production \
            -e SMOKE_RUN_ID "$SMOKE_RUN_ID" \
            -e FERRY_STATUS_PATH "$FERRY_STATUS_PATH" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -- python -m experiments.ferries.datakit_ferry)
          if [ -z "$JOB_ID" ]; then
            echo "::error::iris job run succeeded but printed no job id"
            exit 1
          fi
          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
          echo "Submitted job: $JOB_ID"

      - name: Wait for datakit smoke ferry
        shell: bash -l {0}
        env:
          # Passed through the environment instead of being pasted into the
          # script text by expression expansion.
          JOB_ID: ${{ steps.submit.outputs.job_id }}
        run: |
          if [ -z "$JOB_ID" ]; then
            echo "No job id was produced by the submit step"
            exit 1
          fi
          echo "Polling job status: $JOB_ID"
          # Consecutive polls that returned no state for this job. A few are
          # tolerated so one failed `job list` call (or a job that is not
          # listed yet right after --no-wait) does not fail the whole run.
          MISSES=0
          MAX_MISSES=5
          while true; do
            STATE=$(.venv/bin/iris --config="$IRIS_CONFIG" \
              job list --json --prefix "$JOB_ID" \
              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
            case "$STATE" in
              JOB_STATE_SUCCEEDED)
                echo "Job succeeded"
                exit 0
                ;;
              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
                MISSES=0
                echo "$(date -u +%H:%M:%S) Job state: $STATE"
                sleep 30
                ;;
              "")
                MISSES=$((MISSES + 1))
                if [ "$MISSES" -ge "$MAX_MISSES" ]; then
                  echo "Job not found: $JOB_ID"
                  exit 1
                fi
                echo "$(date -u +%H:%M:%S) No state returned for $JOB_ID (attempt $MISSES/$MAX_MISSES); retrying"
                sleep 30
                ;;
              *)
                # Any other state is treated as terminal failure. The dump is
                # deliberately not filtered to the exact id: it prints every
                # job the prefix matches, same as before.
                echo "Job finished with state: $STATE"
                .venv/bin/iris --config="$IRIS_CONFIG" \
                  job list --json --prefix "$JOB_ID" \
                  | jq '.[] | {job_id, state, error}' || true
                exit 1
                ;;
            esac
          done

      - name: Read ferry status
        id: ferry_status
        shell: bash -l {0}
        run: |
          # No implicit fail-fast with a custom shell template: without this a
          # failed read would publish an empty marin_prefix and still pass.
          set -eo pipefail
          # The path is read from the environment inside Python rather than
          # being spliced into the Python source by the shell.
          PREFIX=$(.venv/bin/python -c '
          import json
          import os
          from rigging.filesystem import url_to_fs
          status_path = os.environ["FERRY_STATUS_PATH"]
          fs, _ = url_to_fs(status_path)
          with fs.open(status_path) as f:
              print(json.load(f)["marin_prefix"])
          ')
          if [ -z "$PREFIX" ]; then
            echo "::error::marin_prefix in $FERRY_STATUS_PATH is empty"
            exit 1
          fi
          echo "marin_prefix=$PREFIX" >> "$GITHUB_OUTPUT"
          echo "Ferry output prefix: $PREFIX"

      - name: Validate datakit smoke outputs
        shell: bash -l {0}
        env:
          MARIN_PREFIX: ${{ steps.ferry_status.outputs.marin_prefix }}
        run: .venv/bin/python scripts/datakit/validate_ferry_outputs.py

      # Best-effort: every command is allowed to fail so diagnostics never
      # change the job result.
      - name: Capture failure diagnostics
        if: failure() || cancelled()
        run: |
          echo "=== Controller logs ==="
          .venv/bin/iris --config="$IRIS_CONFIG" \
            process logs --max-lines=200 || true
          echo "=== Job list ==="
          .venv/bin/iris --config="$IRIS_CONFIG" \
            job list --json 2>/dev/null | jq '.[0:5]' || true

      # The canary-triage skill handles both lanes; CANARY_LANE selects
      # datakit-smoke vs tpu.
      - name: Claude triage
        id: claude_triage
        if: (failure() || cancelled()) && github.event_name == 'schedule'
        uses: anthropics/claude-code-action@v1
        timeout-minutes: 30
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN || secrets.CLAUDE_MAX_OAUTH_TOKEN }}
          prompt: |
            Read .agents/skills/canary-triage/SKILL.md and follow it.
          claude_args: |
            --model opus
            --max-turns 500
            --allowedTools "Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
        env:
          CANARY_LANE: datakit-smoke
          CANARY_JOB_ID: ${{ steps.submit.outputs.job_id }}
          CANARY_RUN_ID: ${{ env.SMOKE_RUN_ID }}
          IRIS_CONFIG: ${{ env.IRIS_CONFIG }}
          WANDB_ENTITY: ${{ env.WANDB_ENTITY }}
          WANDB_PROJECT: ${{ env.WANDB_PROJECT }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          GHA_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

      # Upload Claude's Slack message (if it was written) so the separate
      # notify-slack job can pick it up. Slack notify runs in a separate job
      # because a job-level timeout can force-kill this runner before in-job
      # steps after Claude triage ever execute - see
      # https://github.com/marin-community/marin/actions/runs/24498461666.
      - name: Upload Slack message
        if: failure() || cancelled()
        uses: actions/upload-artifact@v4
        with:
          name: slack-message
          path: slack_message.md
          retention-days: 1
          if-no-files-found: ignore

  # Separate job so Slack always fires, even if the main job is force-killed
  # after its grace window. `needs.datakit-smoke.result` reflects the main job
  # outcome; failure()/cancelled() context functions only see this job's steps.
  notify-slack:
    needs: datakit-smoke
    if: always() && (needs.datakit-smoke.result == 'failure' || needs.datakit-smoke.result == 'cancelled') && github.event_name == 'schedule'
    runs-on: ubuntu-latest
    # This job only downloads a same-run artifact and posts to a webhook, so it
    # drops the workflow-level issues/id-token grants.
    # NOTE(review): assumes same-run artifact download needs no GITHUB_TOKEN
    # scope - confirm on the first scheduled failure.
    permissions: {}
    steps:
      # The artifact is absent when triage never wrote a message; the next step
      # then falls back to a generic text.
      - name: Download Slack message
        uses: actions/download-artifact@v4
        continue-on-error: true
        with:
          name: slack-message

      - name: Notify Slack
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          FALLBACK_TEXT: ":red_circle: *Datakit Smoke failed*\nRun: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
        run: |
          # -s (not -f): an empty triage file would otherwise be posted as an
          # empty message, so fall back in that case too.
          if [ -s slack_message.md ]; then
            TEXT=$(cat slack_message.md)
          else
            TEXT="$FALLBACK_TEXT"
          fi
          # jq does the JSON string escaping of the message text.
          PAYLOAD=$(jq -n --arg text "$TEXT" '{text: $text}')
          # -S keeps curl's error message visible when -f turns an HTTP error
          # into a failed step.
          curl -sS -f --retry 3 --max-time 30 -X POST \
            -H 'Content-Type: application/json' \
            -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"