# Marin - CoreWeave GPU Canary Ferry #44
# NOTE: This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review, open
# the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters.
# Display name of the workflow.
name: Marin - CoreWeave GPU Canary Ferry

# Triggers: a daily scheduled run plus on-demand manual runs.
on:
  schedule:
    - cron: "0 10 * * *"  # Daily at 10 AM UTC
  workflow_dispatch:
    inputs:
      # Read by the teardown step to decide whether NodePools are deleted.
      keep_nodepool:
        description: "Keep CW node pool alive after the run (for faster re-runs)"
        type: boolean
        default: false
# Scopes granted to the workflow's GITHUB_TOKEN.
permissions:
  contents: read  # required by actions/checkout
  packages: write  # required by docker login to ghcr.io for `iris cluster start`
jobs:
  # Starts a CoreWeave cluster via Iris, submits the canary ferry job,
  # summarizes its profile, and always tears the cluster down afterwards.
  canary-ferry-cw:
    runs-on: ubuntu-latest
    timeout-minutes: 180
    # Only one canary may run at a time; a newer run cancels the older one.
    concurrency:
      group: canary-ferry-cw
      cancel-in-progress: true
    env:
      # Unique per workflow run *and* per re-run attempt.
      RUN_ID: canary-gpu-${{ github.run_id }}-${{ github.run_attempt }}
      WANDB_ENTITY: marin-community
      WANDB_PROJECT: marin
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-packages --extra=cpu --no-default-groups

      - name: Write CoreWeave kubeconfig
        env:
          # Pass the secret through the environment instead of interpolating
          # it into the script text: quotes, `$` or backticks inside the
          # kubeconfig would otherwise be interpreted by the shell.
          CW_KUBECONFIG: ${{ secrets.CW_KUBECONFIG }}
        run: |
          mkdir -p ~/.kube
          # umask keeps the file owner-only from the moment it is created.
          (umask 077 && printf '%s\n' "$CW_KUBECONFIG" > ~/.kube/coreweave-iris)
          chmod 600 ~/.kube/coreweave-iris

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Start CoreWeave cluster
        run: .venv/bin/iris -v --config=lib/iris/examples/coreweave.yaml cluster start
        env:
          BUILDKIT_PROGRESS: plain
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}

      - name: Submit canary ferry
        shell: bash -l {0}
        run: |
          .venv/bin/iris --config=lib/iris/examples/coreweave.yaml \
            job run \
            --memory=16G --disk=16G --cpu=1 --extra=cpu \
            -e MARIN_PREFIX s3://marin-us-west-04a/marin/ \
            -e RUN_ID "$RUN_ID" \
            -e WANDB_ENTITY "$WANDB_ENTITY" \
            -e WANDB_PROJECT "$WANDB_PROJECT" \
            -e WANDB_API_KEY "$WANDB_API_KEY" \
            -e HF_TOKEN "$HF_TOKEN" \
            -- python -m experiments.ferries.canary_ferry_cw
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          R2_ACCESS_KEY_ID: ${{ secrets.R2_ACCESS_KEY_ID }}
          R2_SECRET_ACCESS_KEY: ${{ secrets.R2_SECRET_ACCESS_KEY }}

      - name: Summarize GPU canary profile
        shell: bash -l {0}
        run: |
          .venv/bin/python -m marin.profiling.cli summarize \
            --run-target "$RUN_ID" \
            --entity "$WANDB_ENTITY" \
            --project "$WANDB_PROJECT"
        env:
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}

      # `cluster stop` only deletes Pods; NodePools survive and rely on the
      # CW autoscaler to scale down. Delete them explicitly to avoid lingering
      # H100 costs. We attempt both even if one fails — cost protection matters
      # more than clean state.
      # TODO(#3277): Move NodePool teardown into Iris (e.g. `iris cluster stop
      # --delete-nodepools`) so CI doesn't leak the `iris-iris-managed` label.
      - name: Tear down CoreWeave cluster
        # always() keeps this step running after failures and cancellations.
        if: always()
        env:
          # Empty on scheduled runs (no inputs), so NodePools are deleted
          # unless a manual run explicitly sets keep_nodepool=true.
          KEEP_NODEPOOL: ${{ inputs.keep_nodepool }}
        run: |
          .venv/bin/iris -v --config=lib/iris/examples/coreweave.yaml cluster stop || true
          if [ "$KEEP_NODEPOOL" != "true" ]; then
            kubectl --kubeconfig ~/.kube/coreweave-iris \
              delete nodepool -l iris-iris-managed=true
          else
            echo "Keeping node pool alive (keep_nodepool=true)"
          fi