Skip to content

[hd256] Add TMA paged KV support to SM100 2CTA forward kernel (#2489) #1

[hd256] Add TMA paged KV support to SM100 2CTA forward kernel (#2489)

[hd256] Add TMA paged KV support to SM100 2CTA forward kernel (#2489) #1

Workflow file for this run

# CI workflow: a ruff lint job plus a self-hosted GPU job for FA4.
name: CI

# Triggered only by pushes to the listed branches.
on:
  push:
    branches: [main, ci-fix]

# Workflow-wide GITHUB_TOKEN scope: read-only access to repository contents.
permissions:
  contents: read

# Workflow-level environment, visible to every job below.
env:
  # Takes the CI_WORK_DIR configuration variable when it is set; otherwise
  # falls back to /scratch/user/<actor that triggered the run>.
  CI_WORK_DIR: ${{ vars.CI_WORK_DIR || format('/scratch/user/{0}', github.actor) }}
  # Passed as the `test-filter` input of the local gpu-test action.
  # NOTE(review): looks like a test-selection expression of parametrized
  # test ids joined by "or" - confirm against the gpu-test action.
  FA4_TEST_FILTER: "1024-1024-128-True-0-0.0-False-False-False-mha-dtype0 or 1024-1024-128-False-0-0.0-False-False-False-mha-dtype0"
jobs:
  # Static checks with ruff on a GitHub-hosted runner.
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install ruff
        run: pip install ruff
      # Lint flash_attn/cute/, skipping the four files listed in the exclude.
      - name: Ruff check
        run: ruff check flash_attn/cute/ --extend-exclude "flash_attn/cute/flash_bwd.py,flash_attn/cute/flash_fwd.py,flash_attn/cute/flash_fwd_sm100.py,flash_attn/cute/interface.py"
      # Formatting check only (--check); the same four files are skipped.
      - name: Ruff format
        run: ruff format --check flash_attn/cute/ --exclude "flash_attn/cute/flash_bwd.py,flash_attn/cute/flash_fwd.py,flash_attn/cute/flash_fwd_sm100.py,flash_attn/cute/interface.py"

  # GPU job: one run per entry of matrix.gpu, on a self-hosted runner that
  # carries the matching label.
  fa4-correctness-and-benchmark:
    strategy:
      # Do not cancel the remaining matrix entries when one of them fails.
      fail-fast: false
      matrix:
        gpu: [b200]
    runs-on: [self-hosted, '${{ matrix.gpu }}']
    name: fa4-correctness-and-benchmark (${{ matrix.gpu }})
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v4
      # Credentials come from repository secrets.
      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # Local composite/action from this repository (.github/actions/gpu-test).
      - uses: ./.github/actions/gpu-test
        with:
          test-filter: ${{ env.FA4_TEST_FILTER }}
          # Image references carry both a tag and a sha256 digest.
          # NOTE(review): assumed to be inputs of the gpu-test action because
          # they directly follow `test-filter` - confirm against its action.yml.
          fa4_image_cu129: "togethercomputer/training-performance:flash-attn-cu12.9-26.03.25@sha256:304a5c3d2b3a75b151cd2a964cd26d444e0d8b5686d63943df13378c9705f943"
          fa4_image_cu130: "togethercomputer/training-performance:flash-attn-cu13.0-26.04.01@sha256:56e50b056eb4d671410846c3483e843ee7bd0f5b13cb45b6f0d7eb8bd27694a5"