Aiter Test #16
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Aiter Test | |
| on: | |
| push: | |
| branches: [main] | |
| pull_request: | |
| types: [opened, synchronize, reopened, ready_for_review] | |
| branches: [main] # Triggers on PRs targeting `main` | |
| paths-ignore: | |
| - '**/*.md' | |
| - 'docs/**' | |
| - 'LICENSE' | |
| - '.gitignore' | |
| workflow_dispatch: | |
| schedule: | |
| - cron: '0 22 * * *' # 6:00 AM Beijing Time (UTC+8) | |
| concurrency: | |
| # Keep scheduled main runs from blocking push-triggered validation. | |
| group: ${{ github.workflow }}-${{ github.ref }}-${{ github.event_name }} | |
| cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} | |
| env: | |
| DOCKER_IMAGE: "rocm/pytorch:latest" | |
| GPU_ARCH_LIST: "gfx942;gfx950" | |
| GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/ROCm/aiter.git' }} | |
| GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id || github.sha }} | |
| AITER_TEST: "op_tests" | |
| jobs: | |
| check-signal: | |
| if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Download and check signal artifact | |
| run: ./.github/scripts/check_signal.sh | |
| env: | |
| GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} | |
| GITHUB_SHA: ${{ github.sha }} | |
| build_aiter_image: | |
| if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} | |
| runs-on: build-only-aiter | |
| needs: check-signal | |
| permissions: | |
| id-token: write | |
| contents: read | |
| steps: | |
| - name: Checkout code | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| uses: actions/checkout@v4 | |
| # - name: Prepare docker config | |
| # run: | | |
| # export DOCKER_CONFIG="$HOME/.docker" | |
| # mkdir -p "$DOCKER_CONFIG" || true | |
| # cp /docker-config/config.json "$DOCKER_CONFIG/config.json" | |
| # echo "DOCKER_CONFIG=$DOCKER_CONFIG" >> "$GITHUB_ENV" | |
| - name: Generate Dockerfile | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| cat <<EOF > Dockerfile.mod | |
| FROM ${{ env.DOCKER_IMAGE }} | |
| RUN echo "=== Aiter version BEFORE uninstall ===" && pip show amd-aiter || true | |
| RUN pip uninstall -y aiter | |
| RUN pip install --upgrade pandas zmq einops numpy==1.26.2 | |
| RUN pip install --upgrade "pybind11>=3.0.1" | |
| RUN pip install --upgrade "ninja>=1.11.1" | |
| RUN pip install --upgrade "setuptools_scm<9" | |
| RUN pip install tabulate | |
| RUN pip list | |
| RUN rm -rf aiter \ | |
| && git clone ${{ env.GITHUB_REPO_URL }} aiter \ | |
| && cd aiter \ | |
| && git checkout ${{ env.GITHUB_COMMIT_SHA }} \ | |
| && if [ "${{ github.event_name }}" = "schedule" ]; then \ | |
| echo "It's nightly build, syncing latest CK..."; \ | |
| git submodule set-branch --branch develop 3rdparty/composable_kernel; \ | |
| git submodule sync && \ | |
| git submodule update --init --recursive --remote --jobs 4; \ | |
| echo "Nightly CK commit: $(git -C 3rdparty/composable_kernel rev-parse HEAD)"; \ | |
| else \ | |
| echo "Using pinned CK commit..."; \ | |
| git submodule sync && \ | |
| git submodule update --init --recursive --depth 1 --jobs 4; \ | |
| echo "Pinned CK commit: $(git -C 3rdparty/composable_kernel rev-parse HEAD)"; \ | |
| fi \ | |
| && pip install -r requirements.txt \ | |
| && echo "Prebuilding kernels with GPU_ARCHS: ${{ env.GPU_ARCH_LIST }}, PREBUILD_KERNELS: 1, and MAX_JOBS: 128" \ | |
| && PREBUILD_KERNELS=1 MAX_JOBS=128 GPU_ARCHS="${{ env.GPU_ARCH_LIST }}" python setup.py bdist_wheel \ | |
| && pip install dist/*.whl \ | |
| && echo "Prebuilding kernels completed" | |
| RUN echo "=== Aiter version AFTER installation ===" && pip show amd-aiter || true | |
| EOF | |
| - name: Show Dockerfile | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: cat Dockerfile.mod | |
| - name: Build Docker image | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| docker build --network=host --no-cache -t $IMAGE_TAG -f Dockerfile.mod . | |
| - name: Verify prebuilt kernels | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| echo "=== Prebuilt kernel validation ===" | |
| KERNEL_COUNT=$(docker run --rm $IMAGE_TAG find /aiter/aiter/jit -name "*.so" | wc -l) | |
| echo "Prebuilt kernel .so files: $KERNEL_COUNT" | |
| docker run --rm $IMAGE_TAG find /aiter/aiter/jit -name "*.so" | sort | |
| if [ "$KERNEL_COUNT" -lt 10 ]; then | |
| echo "::warning::Prebuild may have failed: expected at least 10 kernel .so files, found $KERNEL_COUNT. This can cause JIT compilation and OOM at runtime." | |
| else | |
| echo "Prebuild validation passed: $KERNEL_COUNT kernels compiled" | |
| fi | |
| - name: Push Docker image | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} | |
| docker push $IMAGE_TAG | |
| - name: Success message | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| echo "Successfully prepared image: rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }}" | |
| - name: Extract wheel from image | |
| if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }} | |
| run: | | |
| set -ex | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| mkdir -p dist | |
| docker run --rm \ | |
| -v "${{ github.workspace }}/dist:/dist" \ | |
| $IMAGE_TAG \ | |
| bash -c "cp /aiter/dist/*.whl /dist/" | |
| echo "Extracted wheels:" | |
| ls -lh dist/*.whl | |
| - name: Upload wheel as artifact | |
| if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }} | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: aiter-whl-main-${{ github.run_id }} | |
| path: dist/*.whl | |
| retention-days: 14 | |
| - name: Configure AWS credentials | |
| if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }} | |
| uses: aws-actions/configure-aws-credentials@v4 | |
| with: | |
| aws-region: us-east-1 | |
| role-to-assume: arn:aws:iam::661452401056:role/framework-aiter-nightlies | |
| - name: Install AWS CLI | |
| if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }} | |
| run: | | |
| if ! command -v aws &> /dev/null; then | |
| curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" | |
| unzip -q awscliv2.zip | |
| sudo ./aws/install | |
| rm -rf awscliv2.zip aws | |
| fi | |
| - name: Upload wheels and latest main manifest to S3 | |
| if: ${{ github.ref == 'refs/heads/main' && github.event_name != 'schedule' }} | |
| run: | | |
| for WHL in dist/*.whl; do | |
| WHL_NAME=$(basename ${WHL}) | |
| echo "Uploading ${WHL_NAME} to S3..." | |
| aws s3 cp ${WHL} s3://framework-whls-nightlies/whl-staging/gfx942-gfx950/${WHL_NAME} | |
| done | |
| echo "Wheels uploaded to S3 staging" | |
| MANIFEST_WHL=$(ls -t dist/amd_aiter*.whl 2>/dev/null | head -1) | |
| if [ -z "$MANIFEST_WHL" ]; then | |
| echo "ERROR: No amd_aiter wheel found in dist/" | |
| exit 1 | |
| fi | |
| MANIFEST_WHL_NAME=$(basename "$MANIFEST_WHL") | |
| MANIFEST_WHL_URL="https://rocm.frameworks-nightlies.amd.com/whl-staging/gfx942-gfx950/${MANIFEST_WHL_NAME}" | |
| MANIFEST_TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") | |
| python3 -c "import json, pathlib, sys; pathlib.Path('latest-main-wheel.json').write_text(json.dumps({'branch': sys.argv[1], 'timestamp': sys.argv[2], 'commit': sys.argv[3], 'wheel_name': sys.argv[4], 'wheel_url': sys.argv[5]}, indent=2) + '\n', encoding='utf-8')" \ | |
| "$GITHUB_REF_NAME" "$MANIFEST_TIMESTAMP" "$GITHUB_SHA" "$MANIFEST_WHL_NAME" "$MANIFEST_WHL_URL" | |
| aws s3 cp latest-main-wheel.json \ | |
| s3://framework-whls-nightlies/whl-staging/gfx942-gfx950/main/latest.json \ | |
| --content-type application/json | |
| echo "Uploaded latest main wheel manifest for ${MANIFEST_WHL_NAME}" | |
| split_aiter_tests: | |
| if: ${{ !github.event.pull_request || github.event.pull_request.draft == false }} | |
| runs-on: ubuntu-latest | |
| needs: [check-signal, build_aiter_image] | |
| outputs: | |
| shard_count: 5 | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Split Aiter Tests (5 shards) | |
| run: ./.github/scripts/split_tests.sh --shards 5 --test-type aiter | |
| - name: Upload test shard lists as artifact | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: aiter_shards | |
| path: aiter_shard_*.list | |
| retention-days: 7 | |
| standard: | |
| if: >- | |
| (!github.event.pull_request || github.event.pull_request.draft == false) && | |
| github.event.action != 'labeled' | |
| name: Standard Tests (1 GPU) | |
| needs: [build_aiter_image, split_aiter_tests] | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: linux-aiter-mi35x-1 | |
| label: MI35X | |
| shard_total: 5 | |
| shard_idx: 0 | |
| - runner: linux-aiter-mi35x-1 | |
| label: MI35X | |
| shard_total: 5 | |
| shard_idx: 1 | |
| - runner: linux-aiter-mi35x-1 | |
| label: MI35X | |
| shard_total: 5 | |
| shard_idx: 2 | |
| - runner: linux-aiter-mi35x-1 | |
| label: MI35X | |
| shard_total: 5 | |
| shard_idx: 3 | |
| - runner: linux-aiter-mi35x-1 | |
| label: MI35X | |
| shard_total: 5 | |
| shard_idx: 4 | |
| - runner: aiter-1gpu-runner | |
| label: MI325 | |
| shard_total: 5 | |
| shard_idx: 0 | |
| - runner: aiter-1gpu-runner | |
| label: MI325 | |
| shard_total: 5 | |
| shard_idx: 1 | |
| - runner: aiter-1gpu-runner | |
| label: MI325 | |
| shard_total: 5 | |
| shard_idx: 2 | |
| - runner: aiter-1gpu-runner | |
| label: MI325 | |
| shard_total: 5 | |
| shard_idx: 3 | |
| - runner: aiter-1gpu-runner | |
| label: MI325 | |
| shard_total: 5 | |
| shard_idx: 4 | |
| runs-on: ${{ matrix.runner }} | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha || github.sha }} | |
| - name: Docker login | |
| run: docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true | |
| - name: Download test shard lists | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: aiter_shards | |
| - name: List test shard files | |
| run: | | |
| ls -l aiter_shard_*.list | |
| - name: Export test file list for this shard as env | |
| id: set_shard_files | |
| run: | | |
| echo "AITER_TEST=$(cat aiter_shard_${{ matrix.shard_idx }}.list)" >> $GITHUB_ENV | |
| echo "$AITER_TEST" | |
| - name: Run the container | |
| run: | | |
| set -ex | |
| echo "Starting container: aiter_test" | |
| if [ -f "/etc/podinfo/gha-render-devices" ]; then | |
| DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) | |
| else | |
| DEVICE_FLAG="--device /dev/dri" | |
| fi | |
| if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then | |
| IMAGE_TAG=${{ env.DOCKER_IMAGE }} | |
| else | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| fi | |
| docker run -dt \ | |
| --device=/dev/kfd $DEVICE_FLAG \ | |
| --shm-size=16G \ | |
| --network=host \ | |
| --group-add $(getent group render | cut -d: -f3) \ | |
| --group-add $(getent group video | cut -d: -f3) \ | |
| -e AITER_TEST="${AITER_TEST}" \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| --name aiter_test \ | |
| $IMAGE_TAG | |
| - name: Setup Aiter for fork PR | |
| if: ${{ github.event.pull_request.head.repo.fork }} | |
| run: | | |
| set -ex | |
| git submodule sync && git submodule update --init --recursive --depth 1 --jobs 4 | |
| echo "Setting up Aiter for fork PR..." | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh" | |
| - name: Restore CK from prebuilt image | |
| if: ${{ !github.event.pull_request.head.repo.fork }} | |
| run: | | |
| set -ex | |
| if [ "${{ github.event_name }}" = "schedule" ]; then | |
| echo "Nightly build: restoring CK that was synced from develop in the prebuild image..." | |
| else | |
| echo "Restoring the pinned CK checkout from the prebuild image..." | |
| fi | |
| rm -rf "${{ github.workspace }}/3rdparty/composable_kernel" | |
| rm -rf "${{ github.workspace }}/.git/modules/3rdparty/composable_kernel" | |
| mkdir -p "${{ github.workspace }}/3rdparty" | |
| mkdir -p "${{ github.workspace }}/.git/modules/3rdparty" | |
| docker cp aiter_test:/aiter/.git/modules/3rdparty/composable_kernel \ | |
| "${{ github.workspace }}/.git/modules/3rdparty/composable_kernel" | |
| docker cp aiter_test:/aiter/3rdparty/composable_kernel \ | |
| "${{ github.workspace }}/3rdparty/composable_kernel" | |
| IMAGE_CK_COMMIT=$(docker exec aiter_test git -C /aiter/3rdparty/composable_kernel rev-parse HEAD) | |
| WORKSPACE_CK_COMMIT=$(git -C "${{ github.workspace }}/3rdparty/composable_kernel" rev-parse HEAD) | |
| echo "Image CK commit: ${IMAGE_CK_COMMIT}" | |
| echo "Workspace CK commit: ${WORKSPACE_CK_COMMIT}" | |
| test "${IMAGE_CK_COMMIT}" = "${WORKSPACE_CK_COMMIT}" | |
| - name: Show Aiter version | |
| run: | | |
| set -ex | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "pip show amd-aiter || true" | |
| - name: Tests | |
| timeout-minutes: 90 | |
| run: | | |
| set -ex | |
| if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "MAX_JOBS=64 SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh" | |
| else | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "SHARD_TOTAL=${{ matrix.shard_total }} SHARD_IDX=${{ matrix.shard_idx }} ./.github/scripts/aiter_test.sh" | |
| fi | |
| - name: Collect test logs | |
| if: always() | |
| run: | | |
| echo "Collecting test logs..." | |
| echo "Aiter Operator Tests Summary:" >> $GITHUB_STEP_SUMMARY | |
| python3 ./.github/scripts/collect_logs.py latest_test.log >> $GITHUB_STEP_SUMMARY | |
| - name: Upload test logs | |
| uses: actions/upload-artifact@v4 | |
| if: success() | |
| with: | |
| name: standard-test-log-${{ matrix.runner }}-shard-${{ matrix.shard_idx }} | |
| path: latest_test.log | |
| retention-days: 7 | |
| - name: Cleanup container | |
| if: always() | |
| run: | | |
| docker rm -f aiter_test || true | |
| standard-test-finish: | |
| if: >- | |
| !github.event.pull_request.draft && | |
| github.event.action != 'labeled' | |
| name: Standard Test Results | |
| runs-on: ubuntu-latest | |
| needs: [standard] | |
| steps: | |
| - name: Download all test logs | |
| uses: actions/download-artifact@v4 | |
| with: | |
| pattern: standard-test-log-*-shard-* | |
| path: . | |
| - name: List test logs | |
| run: | | |
| ls -l standard-test-log-* | |
| - name: Check Standard Test Results | |
| run: | | |
| set -ex | |
| echo "Checking Standard Test Results..." | |
| all_passed=true | |
| for shard in {0..4}; do | |
| for runner in {linux-aiter-mi35x-1,aiter-1gpu-runner}; do | |
| if [ ! -f standard-test-log-${runner}-shard-${shard}/latest_test.log ]; then | |
| echo "Test report for ${runner} shard ${shard} not found." | |
| all_passed=false | |
| break | |
| fi | |
| done | |
| done | |
| if [ "$all_passed" = true ]; then | |
| echo "All tests passed." | |
| else | |
| echo "Test failures or errors detected." | |
| exit 1 | |
| fi | |
| multi-gpu: | |
| name: Multi-GPU Tests (8 GPU) | |
| if: github.ref == 'refs/heads/main' | |
| needs: build_aiter_image | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| include: | |
| - runner: linux-aiter-mi35x-8 | |
| label: MI35X | |
| - runner: aiter-8gpu-runner | |
| label: MI325 | |
| runs-on: ${{ matrix.runner }} | |
| steps: | |
| - name: Checkout code | |
| uses: actions/checkout@v4 | |
| - name: Docker login | |
| run: docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true | |
| - name: Run the container | |
| run: | | |
| set -ex | |
| echo "Starting container: aiter_test" | |
| if [ -f "/etc/podinfo/gha-render-devices" ]; then | |
| DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices) | |
| else | |
| DEVICE_FLAG="--device /dev/dri" | |
| fi | |
| if [ "${{ github.event.pull_request.head.repo.fork }}" = "true" ]; then | |
| IMAGE_TAG=${{ env.DOCKER_IMAGE }} | |
| else | |
| IMAGE_TAG=rocm/aiter-ci:pre-build-${{ env.GITHUB_COMMIT_SHA }} | |
| fi | |
| docker run -dt \ | |
| --device=/dev/kfd $DEVICE_FLAG \ | |
| --shm-size=16G \ | |
| --network=host \ | |
| --group-add $(getent group render | cut -d: -f3) \ | |
| --group-add $(getent group video | cut -d: -f3) \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| --name aiter_test \ | |
| $IMAGE_TAG | |
| - name: Setup Aiter for fork PR | |
| if: ${{ github.event.pull_request.head.repo.fork }} | |
| run: | | |
| set -ex | |
| echo "Setting up Aiter for fork PR..." | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "BUILD_TRITON=0 ./.github/scripts/build_aiter_triton.sh" | |
| - name: Show Aiter version | |
| run: | | |
| set -ex | |
| docker exec \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "pip show amd-aiter || true" | |
| - name: Tests | |
| timeout-minutes: 60 | |
| run: | | |
| set -ex | |
| docker exec \ | |
| -e MULTIGPU=TRUE \ | |
| -w /workspace \ | |
| aiter_test \ | |
| bash -c "./.github/scripts/aiter_test.sh" | |
| - name: Upload test logs | |
| uses: actions/upload-artifact@v4 | |
| if: always() | |
| with: | |
| name: multigpu-test-${{ matrix.runner }} | |
| path: latest_test.log | |
| retention-days: 7 | |
| - name: Cleanup container | |
| if: always() | |
| run: | | |
| docker rm -f aiter_test || true | |
| - name: Clean up Rocm processes | |
| if: always() | |
| run: | | |
| ./.github/scripts/clean_up_rocm.sh |