Marin - CoreWeave GPU Canary Ferry #35
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Daily GPU canary: spins up an Iris cluster on CoreWeave, runs the canary
# ferry experiment, summarizes the W&B profile, and tears the cluster down.
name: Marin - CoreWeave GPU Canary Ferry

on:
  schedule:
    - cron: '0 10 * * *'  # Daily at 10 AM UTC
  workflow_dispatch:

permissions:
  contents: read  # actions/checkout
  packages: write  # docker login ghcr.io for iris cluster start

jobs:
  canary-ferry-cw:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    # Only one canary may run at a time; a newer run supersedes an in-flight one.
    concurrency:
      group: canary-ferry-cw
      cancel-in-progress: true
    env:
      # Unique per attempt so retries produce distinct W&B runs/artifacts.
      RUN_ID: canary-gpu-${{ github.run_id }}-${{ github.run_attempt }}
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
    steps:
      - name: Checkout code
        # v3 runs on the deprecated Node 16 runtime; v4 is a drop-in upgrade.
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        # v4 runs on the deprecated Node 16 runtime; v5 is a drop-in upgrade.
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      - name: Write CoreWeave kubeconfig
        # Pass the secret through env and write it with printf rather than
        # inlining `${{ }}` into the script: echo of an interpolated secret
        # can mangle backslashes or treat a leading `-` as a flag.
        env:
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          mkdir -p ~/.kube
          printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris
          chmod 600 ~/.kube/coreweave-iris

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start CoreWeave cluster
        run: .venv/bin/iris -v --config=lib/iris/examples/coreweave.yaml cluster start
        env:
          BUILDKIT_PROGRESS: plain
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}

      - name: Submit canary ferry
        shell: bash -l {0}
        run: |
          .venv/bin/iris --config=lib/iris/examples/coreweave.yaml \
            job run \
            --memory=16G --disk=16G --cpu=1 --extra=cpu \
            -e MARIN_PREFIX s3://marin-us-west-04a/marin/ \
            -e RUN_ID "$RUN_ID" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -- python -m experiments.ferries.canary_ferry_cw
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}

      # Keep these profiling post-run steps in sync with
      # `.github/workflows/marin-canary-ferry.yaml`.
      - name: Summarize GPU canary profile
        shell: bash -l {0}
        run: |
          .venv/bin/python -m marin.profiling.cli summarize \
            --run-target "$RUN_ID" \
            --entity "$WANDB_ENTITY" \
            --project "$WANDB_PROJECT" \
            --output profile_summary.json
          .venv/bin/python -m marin.profiling.cli report \
            --summary profile_summary.json \
            --output profile_report.md
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      - name: Add profile digest to step summary
        shell: bash -l {0}
        run: |
          .venv/bin/python -m marin.profiling.cli digest \
            --summary profile_summary.json \
            --title "CoreWeave GPU Canary Profile Digest" \
            --top-k 5 \
            --output profile_digest.md
          cat profile_digest.md >> "$GITHUB_STEP_SUMMARY"

      - name: Upload GPU canary profile artifacts
        uses: actions/upload-artifact@v4
        with:
          name: gpu-canary-profile-${{ env.RUN_ID }}
          path: |
            profile_summary.json
            profile_report.md
            profile_digest.md

      # `cluster stop` only deletes Pods; NodePools survive and rely on the
      # CW autoscaler to scale down. Delete them explicitly to avoid lingering
      # H100 costs. We attempt both even if one fails — cost protection matters
      # more than clean state.
      # TODO(#3277): Move NodePool teardown into Iris (e.g. `iris cluster stop
      # --delete-nodepools`) so CI doesn't leak the `iris-iris-managed` label.
      - name: Tear down CoreWeave cluster
        if: always()
        run: |
          .venv/bin/iris -v --config=lib/iris/examples/coreweave.yaml cluster stop || true
          kubectl --kubeconfig ~/.kube/coreweave-iris \
            delete nodepool -l iris-iris-managed=true