name: Marin - CoreWeave GPU Canary Ferry

on:
  schedule:
    - cron: '0 10 * * *' # Daily at 10 AM UTC
  workflow_dispatch:
    inputs:
      target_tokens:
        description: 'Override training token budget'
        type: number
        required: false
      keep_nodepool:
        description: 'Keep CW node pool alive after the run (for faster re-runs)'
        type: boolean
        default: false
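
# A manual run can be triggered with the gh CLI; the input names match the
# workflow_dispatch block above (the token value is only an example):
#   gh workflow run "Marin - CoreWeave GPU Canary Ferry" \
#     -f target_tokens=1310720 -f keep_nodepool=true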

permissions:
  contents: read # actions/checkout
  packages: write # docker login ghcr.io for iris cluster start
  issues: write # claude triage files issues
  id-token: write # claude-code-action OIDC

jobs:
  canary-ferry-cw:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    concurrency:
      group: canary-ferry-cw-iris-canary
      cancel-in-progress: true
    env:
      RUN_ID: canary-gpu-${{ github.run_id }}-${{ github.run_attempt }}
      CANARY_ACCELERATOR: gpu
      CANARY_BATCH_SIZE: "32"
      CANARY_TARGET_TOKENS: "6553600"
      CANARY_MIN_STEPS: "40"
      CANARY_MAX_LOSS: "8.0"
      CANARY_MAX_WALL_CLOCK: "7200"
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
      IRIS_CONFIG: lib/iris/examples/coreweave-canary.yaml
      # Must match the label_prefix and namespace in IRIS_CONFIG so teardown
      # targets only this cluster's resources.
      IRIS_LABEL_PREFIX: iris-canary
      IRIS_NAMESPACE: iris-canary
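    # For reference, the matching fields in coreweave-canary.yaml presumably
    # look something like this (a sketch inferred from the comment above, not
    # copied from the config file):
    #   label_prefix: iris-canary
    #   namespace: iris-canary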
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups
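      # The runner itself only builds the cluster image and drives the iris
      # CLI, so the CPU-only extra is sufficient; the GPU work happens on the
      # CoreWeave node pool provisioned below.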
      - name: Write CoreWeave kubeconfig
        run: |
          mkdir -p ~/.kube
          echo "${{ secrets.CW_KUBECONFIG }}" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Start CoreWeave cluster
        run: .venv/bin/iris -v --config=${{ env.IRIS_CONFIG }} cluster start
        env:
          BUILDKIT_PROGRESS: plain
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
      - name: Submit canary ferry
        id: submit
        shell: bash -l {0}
        run: |
          JOB_ID=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
            job run --no-wait \
            --memory=16G --disk=16G --cpu=1 --extra=cpu \
            -e MARIN_PREFIX s3://marin-na/marin/ \
            -e RUN_ID "$RUN_ID" \
            -e CANARY_ACCELERATOR "$CANARY_ACCELERATOR" \
            -e CANARY_BATCH_SIZE "$CANARY_BATCH_SIZE" \
            -e CANARY_TARGET_TOKENS "$CANARY_TARGET_TOKENS" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -e AWS_ACCESS_KEY_ID "$R2_ACCESS_KEY_ID" \
            -e AWS_SECRET_ACCESS_KEY "$R2_SECRET_ACCESS_KEY" \
            -e AWS_ENDPOINT_URL "https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com" \
            -- python -m experiments.ferries.canary_ferry)
          echo "job_id=$JOB_ID" >> "$GITHUB_OUTPUT"
          echo "Submitted job: $JOB_ID"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          CANARY_TARGET_TOKENS: ${{ inputs.target_tokens || env.CANARY_TARGET_TOKENS }}
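      # Note: iris's `-e` flag takes the variable name and its value as two
      # separate arguments (as above), unlike docker's single KEY=VALUE form.
      # Secrets are supplied via the step-level env block so they never appear
      # in the workflow file itself.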
      - name: Wait for canary ferry
        shell: bash -l {0}
        run: |
          JOB_ID="${{ steps.submit.outputs.job_id }}"
          echo "Polling job status: $JOB_ID"
          while true; do
            STATE=$(.venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
              job list --json --prefix "$JOB_ID" \
              | jq -r --arg id "$JOB_ID" '[.[] | select(.job_id == $id)][0].state // empty')
            case "$STATE" in
              JOB_STATE_SUCCEEDED)
                echo "Job succeeded"
                exit 0
                ;;
              JOB_STATE_PENDING|JOB_STATE_BUILDING|JOB_STATE_RUNNING)
                echo "$(date -u +%H:%M:%S) Job state: $STATE"
                sleep 30
                ;;
              "")
                echo "Job not found: $JOB_ID"
                exit 1
                ;;
              *)
                echo "Job finished with state: $STATE"
                .venv/bin/iris --config=${{ env.IRIS_CONFIG }} \
                  job list --json --prefix "$JOB_ID" \
                  | jq --arg id "$JOB_ID" '.[] | select(.job_id == $id) | {job_id, state, error}' || true
                exit 1
                ;;
            esac
          done
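      # The jq filters above assume `iris job list --json` emits an array of
      # objects carrying at least job_id, state, and error fields, e.g.
      #   [{"job_id": "...", "state": "JOB_STATE_RUNNING", "error": null}]
      # (shape inferred from the selectors used here, not from iris docs).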
      - name: Validate canary metrics
        continue-on-error: ${{ inputs.target_tokens != '' }}
        shell: bash -l {0}
        run: .venv/bin/python scripts/canary/validate_canary_metrics.py
        env:
          MARIN_PREFIX: s3://marin-na/marin/
          AWS_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}
          AWS_ENDPOINT_URL: https://74981a43be0de7712369306c7b19133d.r2.cloudflarestorage.com
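      # The validator inherits CANARY_MIN_STEPS, CANARY_MAX_LOSS, and
      # CANARY_MAX_WALL_CLOCK from the job-level env above and presumably
      # fails if the run misses any of them. continue-on-error is enabled for
      # manual runs with an overridden token budget, since a shortened run may
      # legitimately fall below the default thresholds.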
      - name: Summarize GPU canary profile
        continue-on-error: true
        shell: bash -l {0}
        run: |
          .venv/bin/python -m marin.profiling.cli summarize \
            --run-target "$RUN_ID" \
            --entity "$WANDB_ENTITY" \
            --project "$WANDB_PROJECT"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
      - name: Capture failure diagnostics
        if: failure()
        run: |
          echo "=== Controller logs ==="
          kubectl --kubeconfig ~/.kube/coreweave-iris -n ${{ env.IRIS_NAMESPACE }} \
            logs -l app=iris-controller --tail=500 || true
          echo "=== Controller pod describe ==="
          kubectl --kubeconfig ~/.kube/coreweave-iris -n ${{ env.IRIS_NAMESPACE }} \
            describe pod -l app=iris-controller || true
          echo "=== Task pod logs ==="
          kubectl --kubeconfig ~/.kube/coreweave-iris -n ${{ env.IRIS_NAMESPACE }} \
            logs -l iris.managed=true --tail=200 || true
          echo "=== Task pod describe ==="
          kubectl --kubeconfig ~/.kube/coreweave-iris -n ${{ env.IRIS_NAMESPACE }} \
            describe pod -l iris.managed=true || true
          echo "=== Warning events ==="
          kubectl --kubeconfig ~/.kube/coreweave-iris -n ${{ env.IRIS_NAMESPACE }} \
            get events --sort-by='.lastTimestamp' --field-selector type!=Normal || true
      - name: Claude triage
        id: claude_triage
        if: failure() && github.event_name == 'schedule'
        uses: anthropics/claude-code-action@v1
        timeout-minutes: 30
        with:
          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN || secrets.CLAUDE_MAX_OAUTH_TOKEN }}
          prompt: |
            Read .agents/skills/canary-triage/SKILL.md and follow it.
          claude_args: |
            --model opus
            --max-turns 50
            --allowedTools "Bash(kubectl:*),Bash(gh:*),Bash(.venv/bin/iris:*),Bash(.venv/bin/python:*),Bash(cat:*),Bash(jq:*),Bash(head:*),Bash(tail:*),Bash(grep:*)"
        env:
          CANARY_LANE: gpu
          CANARY_JOB_ID: ${{ steps.submit.outputs.job_id }}
          CANARY_RUN_ID: ${{ env.RUN_ID }}
          IRIS_CONFIG: ${{ env.IRIS_CONFIG }}
          IRIS_NAMESPACE: ${{ env.IRIS_NAMESPACE }}
          WANDB_ENTITY: ${{ env.WANDB_ENTITY }}
          WANDB_PROJECT: ${{ env.WANDB_PROJECT }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          GHA_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      # `cluster stop` only deletes Pods; NodePools survive and rely on the
      # CW autoscaler to scale down. Delete them explicitly to avoid lingering
      # H100 costs.
      - name: Tear down CoreWeave cluster
        if: always()
        run: |
          .venv/bin/iris -v --config=${{ env.IRIS_CONFIG }} cluster stop || true
          if [ "${{ inputs.keep_nodepool }}" != "true" ]; then
            kubectl --kubeconfig ~/.kube/coreweave-iris \
              delete nodepool -l iris-${{ env.IRIS_LABEL_PREFIX }}-managed=true
          else
            echo "Keeping node pool alive (keep_nodepool=true)"
          fi
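      # Leftover node pools can be checked manually with the same selector
      # (IRIS_LABEL_PREFIX expands to iris-canary, giving the label
      # iris-iris-canary-managed=true):
      #   kubectl --kubeconfig ~/.kube/coreweave-iris get nodepool \
      #     -l iris-iris-canary-managed=true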
      - name: Notify Slack on failure
        if: failure() && github.event_name == 'schedule'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
          FALLBACK_TEXT: ":red_circle: *GPU Canary failed*\nRun: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
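        # slack_message.md is presumably written by the Claude triage step
        # (per .agents/skills/canary-triage/SKILL.md); if it is missing, fall
        # back to the generic failure text.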
        run: |
          if [ -f slack_message.md ]; then
            TEXT=$(cat slack_message.md)
          else
            TEXT="$FALLBACK_TEXT"
          fi
          PAYLOAD=$(python3 -c "import sys,json; print(json.dumps({'text': sys.stdin.read()}))" <<< "$TEXT")
          curl -sf -X POST -H 'Content-Type: application/json' -d "$PAYLOAD" "$SLACK_WEBHOOK_URL"