Skip to content

fix(canary): lower GPU canary batch size to 16 to fix OOM #2076

fix(canary): lower GPU canary batch size to 16 to fix OOM

fix(canary): lower GPU canary batch size to 16 to fix OOM #2076

# CI workflow for the fray library (sources under lib/fray).
name: Fray - Tests

# Triggers: pushes to main, and every pull request event GitHub enables by
# default for `pull_request`.
on:
  push:
    branches:
      - main
  pull_request:

# Allow one active run per pull request (or per ref when there is no PR,
# e.g. pushes). A newer run in the same group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
jobs:
  # Gate job: exposes `should_run`, which is 'true' when the change touches
  # one of the path patterns listed under `relevant` below. Both test jobs
  # depend on this output.
  changes:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      pull-requests: read
    outputs:
      should_run: ${{ steps.filter.outputs.relevant }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          # The filter definition is itself YAML, passed to the action as a
          # literal block string.
          filters: |
            relevant:
              - 'lib/fray/**'
              - 'uv.lock'
              - '.github/workflows/fray-unit-tests.yaml'

  # Tests on a GitHub-hosted runner; excludes tests marked `slow` or
  # `tpu_ci`.
  fray-tests:
    needs: changes
    if: needs.changes.outputs.should_run == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string rather than a float.
          python-version: "3.12"
      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          enable-cache: true
      - name: Test fray
        run: |
          # NOTE: do not use frozen here because it makes ray sad when it sets up venvs with uv
          cd lib/fray && uv run --group=fray-test pytest --durations=5 --tb=short -m 'not slow and not tpu_ci' -v -s tests/

  # Tests marked `tpu_ci`, run inside a Docker container on a runner labelled
  # `tpu-ci`.
  fray-tpu-tests:
    needs: changes
    runs-on: [tpu-ci]
    timeout-minutes: 10
    # Runs only for pushes, or for pull requests whose head branch lives in
    # this repository (i.e. not from a fork).
    # NOTE(review): presumably this keeps fork PRs off the TPU runner and away
    # from the secrets used below - confirm.
    if: needs.changes.outputs.should_run == 'true' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          # Checks out the PR merge ref for `pull_request_review` events and
          # falls back to '' (the action's default ref) otherwise.
          # NOTE(review): if this workflow is only triggered by `push` and
          # `pull_request`, the first branch is never taken - confirm whether
          # it can be dropped.
          ref: ${{ (github.event_name == 'pull_request_review' && format('refs/pull/{0}/merge', github.event.pull_request.number)) || '' }}
      # The checkout is mounted read-only and copied to /workspace inside the
      # container before the tests run. HF_TOKEN and WANDB_API_KEY are
      # forwarded from this step's environment by name (`-e NAME`).
      # NOTE(review): the inner `timeout 590` leaves only 10 s of the
      # 10-minute job timeout for clean-up, image pull and copy, so the job
      # timeout may fire first - confirm the intended budget.
      - name: Run TPU tests in Docker
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
        run: |
          bash infra/tpu-ci/clean-tpu.sh
          DOCKER_IMAGE="ghcr.io/marin-community/marin/tpu-ci:latest"
          echo "Using Docker image: $DOCKER_IMAGE"
          # Create UV cache directory
          mkdir -p /tmp/uv-cache
          chmod 777 /tmp/uv-cache
          docker run --rm \
            --device /dev/vfio:/dev/vfio \
            --shm-size=100g \
            --stop-timeout=5 \
            --cap-add=SYS_RESOURCE \
            --ulimit memlock=68719476736:68719476736 \
            -e HF_TOKEN \
            -e JAX_COORDINATOR_ADDRESS=127.0.0.1 \
            -e JAX_PLATFORMS=tpu,cpu \
            -e PJRT_DEVICE=TPU \
            -e TPU_MIN_LOG_LEVEL=3 \
            -e TPU_STDERR_LOG_LEVEL=3 \
            -e UV_CACHE_DIR=/tmp/uv-cache \
            -e WANDB_API_KEY \
            -e WANDB_MODE=offline \
            -v "${GITHUB_WORKSPACE}:/workspace-src:ro" \
            -v /tmp/uv-cache:/tmp/uv-cache:rw \
            -w /workspace \
            "$DOCKER_IMAGE" \
            bash -c "cp -a /workspace-src/. /workspace/ && cd /workspace/lib/fray && timeout --kill-after=5 --signal=TERM 590 uv run --group=fray-tpu-test pytest tests/ -v --tb=short -s --log-cli-level=INFO -m tpu_ci"