Skip to content

fix race conditions in distributed checkpoint loading and prevent memory leaks in gradient gathering #2738

fix race conditions in distributed checkpoint loading and prevent memory leaks in gradient gathering

fix race conditions in distributed checkpoint loading and prevent memory leaks in gradient gathering #2738

Workflow file for this run

# Continuous-integration workflow: the jobs below run on every pull request
# and on every push to the main branch.
name: CI

on:
  pull_request:
  push:
    branches: [main]

jobs:
# Pull-request guard built on the block-fixup-merge action, which is intended
# to stop `fixup!` / `squash!` commits from being merged.
block-fixup:
  # Only evaluated for pull_request events; skipped on pushes to main.
  if: github.event_name == 'pull_request'
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v4
    - name: Block Fixup Commit Merge
      # NOTE(review): the action reference was mangled by e-mail obfuscation
      # in the copy of this file under review; it is restored here to the
      # published action name and tag -- confirm the pinned version.
      uses: 13rac1/block-fixup-merge-action@v2.0.0
# Static checks: Ruff lint and Ruff format verification, once for each Python
# version listed in the matrix.
lint-and-format:
  strategy:
    matrix:
      python-version: ["3.11", "3.12"]
  runs-on: ubuntu-latest
  steps:
    - name: Checkout Repository
      uses: actions/checkout@v4

    # Install uv with caching keyed on the lock file, passing the matrix
    # Python version through to the action.
    - name: Setup uv
      uses: astral-sh/setup-uv@v5
      with:
        python-version: ${{ matrix.python-version }}
        enable-cache: true
        cache-dependency-glob: 'uv.lock'

    # Sync the project environment, including all extras and dev dependencies.
    - name: Install dependencies
      run: uv sync --all-extras --dev

    - name: Run Ruff Lint
      run: uv run ruff check

    # --check makes the formatter report differences instead of rewriting files.
    - name: Run Ruff Format
      run: uv run ruff format --check
# Runs the pytest suite with coverage for each Python version in the matrix
# and uploads the resulting XML coverage report to Codecov.
test:
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version:
        - "3.11"
        - "3.12"
  # Repository secrets exposed as environment variables to every step of the
  # job. The "Create .env file" step below reads them from here.
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    R2_GRADIENTS_ACCOUNT_ID: ${{ secrets.R2_GRADIENTS_ACCOUNT_ID }}
    R2_GRADIENTS_BUCKET_NAME: ${{ secrets.R2_GRADIENTS_BUCKET_NAME }}
    R2_GRADIENTS_READ_ACCESS_KEY_ID: ${{ secrets.R2_GRADIENTS_READ_ACCESS_KEY_ID }}
    R2_GRADIENTS_READ_SECRET_ACCESS_KEY: ${{ secrets.R2_GRADIENTS_READ_SECRET_ACCESS_KEY }}
    R2_GRADIENTS_WRITE_ACCESS_KEY_ID: ${{ secrets.R2_GRADIENTS_WRITE_ACCESS_KEY_ID }}
    R2_GRADIENTS_WRITE_SECRET_ACCESS_KEY: ${{ secrets.R2_GRADIENTS_WRITE_SECRET_ACCESS_KEY }}
    R2_DATASET_ACCOUNT_ID: ${{ secrets.R2_DATASET_ACCOUNT_ID }}
    R2_DATASET_BUCKET_NAME: ${{ secrets.R2_DATASET_BUCKET_NAME }}
    R2_DATASET_READ_ACCESS_KEY_ID: ${{ secrets.R2_DATASET_READ_ACCESS_KEY_ID }}
    R2_DATASET_READ_SECRET_ACCESS_KEY: ${{ secrets.R2_DATASET_READ_SECRET_ACCESS_KEY }}
    R2_DATASET_WRITE_ACCESS_KEY_ID: ${{ secrets.R2_DATASET_WRITE_ACCESS_KEY_ID }}
    R2_DATASET_WRITE_SECRET_ACCESS_KEY: ${{ secrets.R2_DATASET_WRITE_SECRET_ACCESS_KEY }}
    R2_AGGREGATOR_ACCOUNT_ID: ${{ secrets.R2_AGGREGATOR_ACCOUNT_ID }}
    R2_AGGREGATOR_BUCKET_NAME: ${{ secrets.R2_AGGREGATOR_BUCKET_NAME }}
    R2_AGGREGATOR_READ_ACCESS_KEY_ID: ${{ secrets.R2_AGGREGATOR_READ_ACCESS_KEY_ID }}
    R2_AGGREGATOR_READ_SECRET_ACCESS_KEY: ${{ secrets.R2_AGGREGATOR_READ_SECRET_ACCESS_KEY }}
    R2_DATASET_BUCKET_LIST: ${{ secrets.R2_DATASET_BUCKET_LIST }}
    DATASET_BINS_PATH: ${{ secrets.DATASET_BINS_PATH }}
  steps:
    - name: Checkout Repository
      uses: actions/checkout@v4

    # Write the credentials to a .env file in the workspace.
    #
    # The values are read from the job-level environment variables instead of
    # splicing `${{ secrets.* }}` into the script text. An inlined secret
    # becomes part of the shell source, so inside this unquoted heredoc any
    # `$`, backtick or backslash in it would be re-interpreted by the shell.
    # The result of a `${VAR}` expansion is not re-scanned, so values reach
    # the file verbatim.
    - name: Create .env file
      run: |
        cat << EOF > .env
        # Hugging Face Token
        HF_TOKEN=${HF_TOKEN}
        # Cloudflare R2 Credentials
        R2_GRADIENTS_ACCOUNT_ID=${R2_GRADIENTS_ACCOUNT_ID}
        R2_GRADIENTS_BUCKET_NAME=${R2_GRADIENTS_BUCKET_NAME}
        R2_GRADIENTS_READ_ACCESS_KEY_ID=${R2_GRADIENTS_READ_ACCESS_KEY_ID}
        R2_GRADIENTS_READ_SECRET_ACCESS_KEY=${R2_GRADIENTS_READ_SECRET_ACCESS_KEY}
        R2_GRADIENTS_WRITE_ACCESS_KEY_ID=${R2_GRADIENTS_WRITE_ACCESS_KEY_ID}
        R2_GRADIENTS_WRITE_SECRET_ACCESS_KEY=${R2_GRADIENTS_WRITE_SECRET_ACCESS_KEY}
        R2_DATASET_ACCOUNT_ID=${R2_DATASET_ACCOUNT_ID}
        R2_DATASET_BUCKET_NAME=${R2_DATASET_BUCKET_NAME}
        R2_DATASET_READ_ACCESS_KEY_ID=${R2_DATASET_READ_ACCESS_KEY_ID}
        R2_DATASET_READ_SECRET_ACCESS_KEY=${R2_DATASET_READ_SECRET_ACCESS_KEY}
        R2_DATASET_WRITE_ACCESS_KEY_ID=${R2_DATASET_WRITE_ACCESS_KEY_ID}
        R2_DATASET_WRITE_SECRET_ACCESS_KEY=${R2_DATASET_WRITE_SECRET_ACCESS_KEY}
        R2_DATASET_BUCKET_LIST=${R2_DATASET_BUCKET_LIST}
        R2_AGGREGATOR_ACCOUNT_ID=${R2_AGGREGATOR_ACCOUNT_ID}
        R2_AGGREGATOR_BUCKET_NAME=${R2_AGGREGATOR_BUCKET_NAME}
        R2_AGGREGATOR_READ_ACCESS_KEY_ID=${R2_AGGREGATOR_READ_ACCESS_KEY_ID}
        R2_AGGREGATOR_READ_SECRET_ACCESS_KEY=${R2_AGGREGATOR_READ_SECRET_ACCESS_KEY}
        EOF

    # Install uv with caching keyed on the lock file, passing the matrix
    # Python version through to the action.
    - name: Setup uv
      uses: astral-sh/setup-uv@v5
      with:
        enable-cache: true
        cache-dependency-glob: "uv.lock"
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      run: uv sync --all-extras --dev

    # Emits coverage both to the terminal and to coverage.xml, which the
    # upload step below consumes.
    - name: Run Tests with Coverage
      run: |
        uv run pytest tests/ -v --cov=src --cov-report=xml --cov-report=term

    # fail_ci_if_error makes a failed upload fail the job instead of being
    # silently ignored.
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v4
      with:
        token: ${{ secrets.CODECOV_TOKEN }}
        file: ./coverage.xml
        fail_ci_if_error: true