[datasets] Add Hermes trace support to the SFT pipeline #5895
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Levanter CI: every test job below is gated on the `changes` job's `should_run` output. | |
| name: Levanter - Tests | |
| # Triggers: pushes to main, and pull requests (no branch or activity-type filter is set for pull_request). | |
| on: | |
| push: | |
| branches: | |
| - main | |
| pull_request: | |
| # One concurrency group per workflow and PR number (falling back to the git ref when there is no PR); | |
| # a newer run in the same group cancels the one still in progress. | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} | |
| cancel-in-progress: true | |
| jobs: | |
| # Path gate: publishes `should_run` (the paths-filter `relevant` output) for the test jobs to check. | |
| changes: | |
| runs-on: ubuntu-latest | |
| permissions: | |
| contents: read | |
| pull-requests: read | |
| outputs: | |
| should_run: ${{ steps.filter.outputs.relevant }} | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - uses: dorny/paths-filter@v3 | |
| id: filter | |
| with: | |
| # `relevant` matches changes under lib/levanter or lib/haliax, to uv.lock, or to | |
| # .github/workflows/levanter-tests.yaml (presumably this workflow file itself; confirm). | |
| filters: | | |
| relevant: | |
| - 'lib/levanter/**' | |
| - 'lib/haliax/**' | |
| - 'uv.lock' | |
| - '.github/workflows/levanter-tests.yaml' | |
| # Main CPU suite: pytest over tests that are not marked entry, slow, ray or tpu, with jax[cpu] pinned to 0.8.0. | |
| # NOTE(review): no `timeout-minutes` here, unlike levanter-torch-tests and levanter-tpu-tests (15), | |
| # so GitHub's default job timeout (360 minutes per the workflow-syntax docs) applies; confirm this is intended. | |
| levanter-tests: | |
| needs: changes | |
| if: needs.changes.outputs.should_run == 'true' | |
| runs-on: ubuntu-latest | |
| # All `run` steps in this job execute from lib/levanter. | |
| defaults: | |
| run: | |
| working-directory: lib/levanter | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Install uv and Python | |
| uses: astral-sh/setup-uv@v6 | |
| with: | |
| version: "0.7.20" | |
| python-version: "3.11" | |
| enable-cache: true | |
| working-directory: lib/levanter | |
| # Presumably required by `uv sync`: the TPU job notes Node.js is needed for protobuf generation during uv sync. | |
| - name: Set up Node.js | |
| uses: actions/setup-node@v4 | |
| with: | |
| node-version: "22" | |
| - name: Set up Python | |
| run: uv python install | |
| # --frozen: install from uv.lock as-is, without re-resolving. | |
| - name: Install dependencies | |
| run: uv sync --package marin-levanter --dev --group test --frozen | |
| - name: Test with pytest | |
| run: | | |
| # Pin jax[cpu]==0.8.0 for this run and deselect tests marked entry, slow, ray or tpu. | |
| PYTHONPATH=tests:src:. uv run --package marin-levanter --frozen --with "jax[cpu]==0.8.0" pytest tests -m "not entry and not slow and not ray and not tpu" --durations=20 | |
| # Ray suite: runs only the tests selected by the `ray` pytest marker. | |
| # NOTE(review): no `timeout-minutes` here, unlike levanter-torch-tests and levanter-tpu-tests (15), | |
| # so GitHub's default job timeout (360 minutes per the workflow-syntax docs) applies; confirm this is intended. | |
| levanter-ray-tests: | |
| needs: changes | |
| if: needs.changes.outputs.should_run == 'true' | |
| runs-on: ubuntu-latest | |
| # All `run` steps in this job execute from lib/levanter. | |
| defaults: | |
| run: | |
| working-directory: lib/levanter | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Install uv and Python | |
| uses: astral-sh/setup-uv@v6 | |
| with: | |
| version: "0.7.20" | |
| python-version: "3.11" | |
| enable-cache: true | |
| working-directory: lib/levanter | |
| # Presumably required by `uv sync`: the TPU job notes Node.js is needed for protobuf generation during uv sync. | |
| - name: Set up Node.js | |
| uses: actions/setup-node@v4 | |
| with: | |
| node-version: "22" | |
| - name: Set up Python | |
| run: uv python install | |
| # --frozen: install from uv.lock as-is, without re-resolving. | |
| - name: Install dependencies | |
| run: uv sync --package marin-levanter --dev --group test --frozen | |
| - name: Test with pytest | |
| run: | | |
| # Unlike the main suite, this command does not add a `--with jax[...]` pin. | |
| PYTHONPATH=tests:src:. uv run --package marin-levanter --frozen pytest tests -m "ray" --durations=20 | |
| # Entry suite: runs only the tests selected by the `entry` pytest marker. | |
| # NOTE(review): no `timeout-minutes` here, unlike levanter-torch-tests and levanter-tpu-tests (15), | |
| # so GitHub's default job timeout (360 minutes per the workflow-syntax docs) applies; confirm this is intended. | |
| levanter-entry-tests: | |
| needs: changes | |
| if: needs.changes.outputs.should_run == 'true' | |
| runs-on: ubuntu-latest | |
| # All `run` steps in this job execute from lib/levanter. | |
| defaults: | |
| run: | |
| working-directory: lib/levanter | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Install uv and Python | |
| uses: astral-sh/setup-uv@v6 | |
| with: | |
| version: "0.7.20" | |
| python-version: "3.11" | |
| enable-cache: true | |
| working-directory: lib/levanter | |
| # Presumably required by `uv sync`: the TPU job notes Node.js is needed for protobuf generation during uv sync. | |
| - name: Set up Node.js | |
| uses: actions/setup-node@v4 | |
| with: | |
| node-version: "22" | |
| - name: Set up Python | |
| run: uv python install | |
| # --frozen: install from uv.lock as-is, without re-resolving. | |
| - name: Install dependencies | |
| run: uv sync --package marin-levanter --dev --group test --frozen | |
| - name: Test with pytest | |
| run: | | |
| # Unlike the main suite, this command does not add a `--with jax[...]` pin. | |
| PYTHONPATH=tests:src:. uv run --package marin-levanter --frozen pytest tests -m "entry" --durations=20 | |
| # Torch suite: runs the tests selected by the `torch` marker against a CPU-only torch wheel; capped at 15 minutes. | |
| levanter-torch-tests: | |
| needs: changes | |
| if: needs.changes.outputs.should_run == 'true' | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 15 | |
| # All `run` steps in this job execute from lib/levanter. | |
| defaults: | |
| run: | |
| working-directory: lib/levanter | |
| steps: | |
| - uses: actions/checkout@v4 | |
| - name: Install uv and Python | |
| uses: astral-sh/setup-uv@v6 | |
| with: | |
| version: "0.7.20" | |
| python-version: "3.11" | |
| enable-cache: true | |
| working-directory: lib/levanter | |
| # Presumably required by `uv sync`: the TPU job notes Node.js is needed for protobuf generation during uv sync. | |
| - name: Set up Node.js | |
| uses: actions/setup-node@v4 | |
| with: | |
| node-version: "22" | |
| - name: Set up Python | |
| run: uv python install | |
| # Sync the torch_test extra but skip torch itself (--no-install-package torch); the next step installs it. | |
| - name: Install dependencies | |
| run: uv sync --package marin-levanter --dev --group test --frozen --extra torch_test --no-install-package torch | |
| - name: Install locked CPU torch wheel | |
| run: | | |
| # Read ../../uv.lock (relative to lib/levanter) and collect the versions of the `torch` package that end in "+cpu". | |
| # The script exits non-zero unless exactly one such version exists, and prints that version on success. | |
| # NOTE(review): this uses whatever `python3` is on PATH; tomllib is only in the stdlib from Python 3.11, so confirm the runner's interpreter. | |
| TORCH_CPU_VERSION=$(python3 - <<'PY' | |
| import tomllib | |
| from pathlib import Path | |
| lock = tomllib.loads(Path("../../uv.lock").read_text()) | |
| versions = { | |
| package["version"] | |
| for package in lock.get("package", []) | |
| if package.get("name") == "torch" and package.get("version", "").endswith("+cpu") | |
| } | |
| if len(versions) != 1: | |
| raise SystemExit( | |
| f"Expected exactly one torch +cpu version in uv.lock, found {len(versions)}: {sorted(versions)}" | |
| ) | |
| print(next(iter(versions))) | |
| PY | |
| ) | |
| echo "Using torch ${TORCH_CPU_VERSION}" | |
| # Defensive guard in case the lookup produced no output. | |
| if [[ -z "$TORCH_CPU_VERSION" ]]; then | |
| echo "Unable to determine torch CPU version from uv.lock" >&2 | |
| exit 1 | |
| fi | |
| # Install exactly that version; both the PyTorch CPU wheel index and PyPI are offered to the resolver. | |
| uv pip install "torch==$TORCH_CPU_VERSION" \ | |
| --default-index https://download.pytorch.org/whl/cpu \ | |
| --index https://pypi.org/simple \ | |
| --index-strategy unsafe-best-match | |
| - name: Test torch-marked suite | |
| run: | | |
| # Run pytest on CPU (CUDA_VISIBLE_DEVICES is set to empty) to avoid GPU dependency issues, | |
| # invoking the workspace venv's interpreter directly instead of going through `uv run`. | |
| # NOTE(review): presumably this bypasses `uv run` so the manually installed CPU wheel is not re-synced; confirm. | |
| CUDA_VISIBLE_DEVICES="" PYTHONPATH=tests:src:. ../../.venv/bin/python -m pytest tests -m "torch" --durations=20 | |
| # Disabled `uv run` alternative; note it names package `levanter`, whereas the active commands use `marin-levanter`: | |
| # CUDA_VISIBLE_DEVICES="" PYTHONPATH=tests:src:. uv run --package levanter --frozen --extra torch_test pytest tests -m "torch" --durations=20 | |
| # TPU suite: runs inside Docker on a runner labelled `tpu-ci`; capped at 15 minutes. | |
| # Besides the `changes` gate, it runs only for push events or for pull requests whose head repo is this repository. | |
| levanter-tpu-tests: | |
| needs: changes | |
| if: needs.changes.outputs.should_run == 'true' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) | |
| runs-on: [tpu-ci] | |
| timeout-minutes: 15 | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| with: | |
| # NOTE(review): this workflow's `on:` lists only push and pull_request, so the pull_request_review | |
| # branch never matches and `ref` always evaluates to '' here; confirm whether this is leftover. | |
| ref: ${{ (github.event_name == 'pull_request_review' && format('refs/pull/{0}/merge', github.event.pull_request.number)) || '' }} | |
| - name: Run Levanter TPU tests in Docker | |
| # HF_TOKEN and WANDB_API_KEY are forwarded into the container by name (`-e HF_TOKEN`, `-e WANDB_API_KEY`). | |
| # JAX_VERSION is expanded by this step's shell inside the double-quoted `bash -c` string (`jax[tpu]==$JAX_VERSION`). | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }} | |
| JAX_VERSION: "0.8.0" | |
| run: | | |
| bash infra/tpu-ci/clean-tpu.sh | |
| DOCKER_IMAGE="ghcr.io/marin-community/marin/tpu-ci:latest" | |
| echo "Using Docker image: $DOCKER_IMAGE" | |
| # Create a world-writable uv cache directory on the host; it is bind-mounted into the container below. | |
| mkdir -p /tmp/uv-cache | |
| chmod 777 /tmp/uv-cache | |
| # Run Levanter tests with specific JAX version | |
| # TPU collection imports call jax.devices(), so keep pytest single-process here. | |
| # The checkout is mounted read-only at /workspace-src and copied to /workspace inside the container before pytest runs. | |
| # The inner `timeout ... 890` (seconds) sits 10 s under the 15-minute (900 s) job timeout, and that job timeout | |
| # also covers the steps that run before pytest starts. | |
| docker run --rm \ | |
| --device /dev/vfio:/dev/vfio \ | |
| --shm-size=100g \ | |
| --stop-timeout=5 \ | |
| --cap-add=SYS_RESOURCE \ | |
| --ulimit memlock=68719476736:68719476736 \ | |
| -e TPU_CI=true \ | |
| -e JAX_COORDINATOR_ADDRESS=127.0.0.1 \ | |
| -e TPU_STDERR_LOG_LEVEL=3 \ | |
| -e TPU_MIN_LOG_LEVEL=3 \ | |
| -e PYTHONPATH=/workspace \ | |
| -e JAX_PLATFORMS=tpu,cpu \ | |
| -e PJRT_DEVICE=TPU \ | |
| -e HF_TOKEN \ | |
| -e WANDB_API_KEY \ | |
| -e WANDB_MODE=offline \ | |
| -e UV_CACHE_DIR=/tmp/uv-cache \ | |
| -v ${{ github.workspace }}:/workspace-src:ro \ | |
| -v /tmp/uv-cache:/tmp/uv-cache:rw \ | |
| -w /workspace \ | |
| $DOCKER_IMAGE \ | |
| bash -c "\ | |
| # Install Node.js in userspace if not present (needed for protobuf generation during uv sync) | |
| if ! command -v npx >/dev/null 2>&1; then \ | |
| echo '::group::Installing Node.js in userspace'; \ | |
| curl -fsSL https://nodejs.org/dist/v22.16.0/node-v22.16.0-linux-x64.tar.xz | tar -xJ -C /tmp && \ | |
| export PATH=/tmp/node-v22.16.0-linux-x64/bin:\$PATH; \ | |
| echo '::endgroup::'; \ | |
| fi && \ | |
| cp -a /workspace-src/. /workspace/ && cd /workspace && \ | |
| timeout --kill-after=5 --signal=TERM 890 \ | |
| uv run --package marin-levanter --frozen --group test --with 'jax[tpu]==$JAX_VERSION' \ | |
| pytest -n 0 lib/levanter/tests \ | |
| -m 'not entry and not ray and not slow and not torch' \ | |
| --ignore=lib/levanter/tests/test_audio.py \ | |
| --ignore=lib/levanter/tests/test_new_cache.py \ | |
| --ignore=lib/levanter/tests/test_hf_checkpoints.py \ | |
| --ignore=lib/levanter/tests/test_hf_gpt2_serialize.py \ | |
| --ignore=lib/levanter/tests/test_gdn_layer.py \ | |
| -v --tb=short --log-cli-level=WARNING --durations=20" |