Skip to content

Commit 19d33da

Browse files
committed
Test
1 parent 27181b6 commit 19d33da

File tree

1 file changed

+182
-0
lines changed

1 file changed

+182
-0
lines changed

.github/workflows/test_xpu.yml

Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
# Workflow that runs the torchcomms test job on an XPU runner.
name: xpu-test

# Triggered manually (workflow_dispatch) and on every pull request.
on:
  workflow_dispatch:
  pull_request:

# Token scopes granted to the jobs in this workflow.
permissions:
  id-token: write
  contents: read

# On refs/heads/main the group key includes the run number, so each run gets
# its own group; on any other ref the key is the ref itself, so a newer run on
# the same ref cancels the one still in progress.
concurrency:
  group: >-
    xpu_ci_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
14+
15+
jobs:
  # Single job: checks out pytorch (nightly) and this repository, runs runner
  # health checks, resolves and pulls the CI docker image, runs
  # torchcomms/.github/scripts/xpu_test.sh inside a detached container, then
  # collects coredump backtraces and cleans up the container.
  test:
    # Don't run on forked repos or empty test matrix
    # (guard currently disabled)
    # if: github.repository_owner == 'meta-pytorch'
    timeout-minutes: 120
    runs-on: pvc_rolling
    env:
      # Base image reference; the "Get docker image" step appends
      # "-<tree hash of .ci/docker>" to form the final tag.
      DOCKER_IMAGE: ghcr.io/pytorch/ci-image:pytorch-linux-noble-xpu-n-py3
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
          repository: pytorch/pytorch
          ref: nightly
          path: pytorch
          fetch-depth: 1
          submodules: false

      - name: Checkout Torchcomms
        uses: actions/checkout@v4
        with:
          path: torchcomms

      - name: Clean all stopped docker containers
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          # The grep process itself matches the pattern, so a count of 1
          # means no prune is currently running.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner health check xpu-smi
        if: always()
        shell: bash
        run: |
          # Informational only: a failure or timeout here never fails the step.
          timeout 30 xpu-smi discovery || true

      - name: Runner health check GPU count
        if: always()
        shell: bash
        run: |
          # Count the 'Device' lines printed by clinfo; "|| true" keeps the
          # step alive when grep finds nothing so the check below can report.
          ngpu=$(timeout 30 clinfo -l | grep -c -E 'Device' || true)
          msg="Please file an issue on meta-pytorch/torchcomms reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
          if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

      - name: Preserve github env variables for use in docker
        shell: bash
        run: |
          # NOTE(review): this file is written here but no step in this
          # workflow passes it to docker (e.g. via --env-file) - confirm
          # whether it is still needed.
          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

      - name: XPU set GPU_FLAG
        shell: bash
        run: |
          # Add render group for container creation.
          # Anchor the match so only the group named exactly "render" is used.
          render_gid=$(grep '^render:' /etc/group | cut -d: -f3)
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"

      - name: Get docker image
        id: get-docker-image
        working-directory: pytorch
        shell: bash
        run: |
          DOCKER_BUILD_DIR=".ci/docker"
          # Final image reference: "<DOCKER_IMAGE>-<git tree hash of .ci/docker>".
          # DOCKER_IMAGE comes from the job-level env.
          DOCKER_IMAGE_NAME="${DOCKER_IMAGE}-$(git rev-parse HEAD:"${DOCKER_BUILD_DIR}")"
          echo "docker-image=${DOCKER_IMAGE_NAME}" >> "${GITHUB_OUTPUT}"

      - name: Pull docker image
        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
        with:
          docker-image: ${{ steps.get-docker-image.outputs.docker-image }}

      - name: Test
        id: test
        shell: bash
        env:
          TEST_COMMAND: torchcomms/.github/scripts/xpu_test.sh
          # Overrides the job-level DOCKER_IMAGE with the fully resolved tag.
          DOCKER_IMAGE: ${{ steps.get-docker-image.outputs.docker-image }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_JOB: ${{ github.job }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_NUMBER: ${{ github.run_number }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
        timeout-minutes: 120
        run: |
          set -x

          # The detached container is stopped by the "Stop container before
          # exit" step and pruned by "Teardown XPU".
          # GPU_FLAG is deliberately unquoted so it splits into separate
          # docker options.
          # shellcheck disable=SC2086,SC2090
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e PR_NUMBER \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
            -e GITHUB_WORKFLOW \
            -e GITHUB_JOB \
            -e GITHUB_RUN_ID \
            -e GITHUB_RUN_NUMBER \
            -e GITHUB_RUN_ATTEMPT \
            -e JOB_ID \
            -e BRANCH \
            -e SHA1 \
            --ulimit stack=10485760:83886080 \
            --ulimit core=0 \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
            --shm-size="8g" \
            --tty \
            --detach \
            --user jenkins \
            --privileged \
            -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
            -w /var/lib/jenkins/workspace \
            "${DOCKER_IMAGE}"
          )
          # save container name for later step
          echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
          # Run the test script inside the container, from the mounted
          # workspace.
          # NOTE(review): an earlier comment said the workspace is copied to
          # the jenkins home because jenkins cannot write to the mount; no
          # copy happens here - presumably TEST_COMMAND handles it, confirm.
          docker exec -t "${container_name}" sh -c "bash ${TEST_COMMAND}"

      - name: Collect backtraces from coredumps (if any)
        if: always()
        shell: bash
        run: |
          # This step runs even when the container was never created; skip
          # instead of failing on an empty container name.
          if [[ -z "${CONTAINER_NAME:-}" ]]; then
            echo "No container was started; skipping coredump collection"
            exit 0
          fi
          # shellcheck disable=SC2156
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Stop container before exit
        if: always()
        shell: bash
        run: |
          # Workaround for multiple runners on same IDC node
          # Only stop a container that was actually started.
          if [[ -n "${CONTAINER_NAME:-}" ]]; then
            docker stop "${CONTAINER_NAME}"
          fi

      - name: Store Core dumps on GitHub
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
        if: failure()
        with:
          # This job has no matrix, so the name is keyed by run id and attempt.
          name: coredumps-xpu-${{ github.run_id }}-${{ github.run_attempt }}
          retention-days: 14
          if-no-files-found: ignore
          path: ./**/core.[1-9]*

      - name: Teardown XPU
        if: always()
        shell: bash
        run: |
          # Prune all stopped containers.
          # If other runner is pruning on this node, will skip.
          nprune=$(ps -ef | grep -c "docker container prune")
          if [[ $nprune -eq 1 ]]; then
            docker container prune -f
          fi

      - name: Runner diskspace health check
        uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
        if: always()

0 commit comments

Comments
 (0)