# CI workflow: run torchcomms tests on an Intel XPU (PVC) self-hosted runner
# inside the pytorch nightly XPU docker image.
name: xpu-test

on:
  workflow_dispatch:
  pull_request:

permissions:
  id-token: write
  contents: read

# One run per ref (PRs cancel superseded runs); main keys on run_number so
# pushes to main never cancel each other.
concurrency:
  group: xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

jobs:
  test:
    # Don't run on forked repos or empty test matrix
    # if: github.repository_owner == 'meta-pytorch'
    timeout-minutes: 120
    runs-on: pvc_rolling
    env:
      # Base image name; the per-commit tag suffix is appended in "Get docker image".
      DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-noble-xpu-n-py3
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      - name: Checkout Torchcomms
        uses: actions/checkout@v4
        with:
          path: torchcomms

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          # Best-effort device discovery; never fail the job here.
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on meta-pytorch/torchcomms reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          # getent is more precise than grepping /etc/group (avoids substring matches).
          render_gid=$(getent group render | cut -d: -f3)
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: Get docker image
        id: get-docker-image
        working-directory: pytorch
        run: |
          DOCKER_BUILD_DIR=".ci/docker"
          # Full image = base name (job env DOCKER_IMAGE) + "-" + hash of the
          # .ci/docker tree in the pinned pytorch checkout, which is how the CI
          # images are tagged.
          # Fix: the original `${${env.DOCKER_IMAGE}-}` mixed GitHub-expression
          # syntax into a bash parameter expansion and is a bash syntax error.
          DOCKER_IMAGE_NAME="${DOCKER_IMAGE}-$(git rev-parse HEAD:"${DOCKER_BUILD_DIR}")"
          echo "docker-image=${DOCKER_IMAGE_NAME}" >> "${GITHUB_OUTPUT}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.get-docker-image.outputs.docker-image }}

      - name: Test
        id: test
        env:
          TEST_COMMAND: torchcomms/.github/scripts/xpu_test.sh
          DOCKER_IMAGE: ${{ steps.get-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 120
        run: |
          set -x

          # detached container should get cleaned up by teardown_ec2_linux
          # Used for GPU_FLAG since that doesn't play nice
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # jenkins user does not have write permission to mounted workspace; work-around by copying within container to jenkins home
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        run: |
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        run: |
          # Workaround for multiple runners on same IDC node
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()