fix(canary): lower GPU canary batch size to 16 to fix OOM #2076
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fray test workflow.
#
# Runs on pushes to `main` and on pull requests. A lightweight `changes` job
# decides whether anything relevant was touched; the two test jobs only run
# when it reports `true`.
name: Fray - Tests

on:
  push:
    branches:
      - main
  pull_request:

# One active run per pull request (or per ref when there is no PR number);
# a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  # Path filter: exposes `should_run` so the test jobs can be skipped when
  # none of the listed paths changed.
  changes:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      should_run: ${{ steps.filter.outputs.relevant }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          filters: |
            relevant:
              - 'lib/fray/**'
              - 'uv.lock'
              - '.github/workflows/fray-unit-tests.yaml'

  # Unit tests on a GitHub-hosted runner; excludes tests marked `slow` or
  # `tpu_ci`.
  fray-tests:
    needs: changes
    if: needs.changes.outputs.should_run == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string rather than a YAML float.
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
      - name: Test fray
        run: |
          # NOTE: do not use frozen here because it makes ray sad when it sets up venvs with uv
          cd lib/fray && uv run --group=fray-test pytest --durations=5 --tb=short -m 'not slow and not tpu_ci' -v -s tests/

  # Tests marked `tpu_ci`, executed inside a Docker container on the runner
  # labelled `tpu-ci`. Skipped for pull requests whose head repository is not
  # this repository, so the secret-reading step below never runs for them.
  fray-tpu-tests:
    needs: changes
    runs-on: [tpu-ci]
    # NOTE(review): the in-container `timeout 590` below is only 10 s short of
    # this limit, and the limit also covers checkout, clean-tpu.sh and
    # container start-up -- confirm there is enough headroom for the inner
    # timeout to fire before the job is cancelled.
    timeout-minutes: 10
    if: needs.changes.outputs.should_run == 'true' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Selects the PR merge ref only for `pull_request_review` events.
          # NOTE(review): that event is not among this workflow's triggers, so
          # the expression is currently always empty (checkout's default ref).
          ref: ${{ (github.event_name == 'pull_request_review' && format('refs/pull/{0}/merge', github.event.pull_request.number)) || '' }}
      - name: Run TPU tests in Docker
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: |
          bash infra/tpu-ci/clean-tpu.sh
          DOCKER_IMAGE="ghcr.io/marin-community/marin/tpu-ci:latest"
          echo "Using Docker image: $DOCKER_IMAGE"
          # Create UV cache directory
          mkdir -p /tmp/uv-cache
          chmod 777 /tmp/uv-cache
          # The checkout is mounted read-only and copied into /workspace inside
          # the container. HF_TOKEN and WANDB_API_KEY are forwarded by name
          # from this step's environment.
          docker run --rm \
            --device /dev/vfio:/dev/vfio \
            --shm-size=100g \
            --stop-timeout=5 \
            --cap-add=SYS_RESOURCE \
            --ulimit memlock=68719476736:68719476736 \
            -e HF_TOKEN \
            -e JAX_COORDINATOR_ADDRESS=127.0.0.1 \
            -e JAX_PLATFORMS=tpu,cpu \
            -e PJRT_DEVICE=TPU \
            -e TPU_MIN_LOG_LEVEL=3 \
            -e TPU_STDERR_LOG_LEVEL=3 \
            -e UV_CACHE_DIR=/tmp/uv-cache \
            -e WANDB_API_KEY \
            -e WANDB_MODE=offline \
            -v "$GITHUB_WORKSPACE":/workspace-src:ro \
            -v /tmp/uv-cache:/tmp/uv-cache:rw \
            -w /workspace \
            "$DOCKER_IMAGE" \
            bash -c "cp -a /workspace-src/. /workspace/ && cd /workspace/lib/fray && timeout --kill-after=5 --signal=TERM 590 uv run --group=fray-tpu-test pytest tests/ -v --tb=short -s --log-cli-level=INFO -m tpu_ci"