# pr-test.yml
---
# PR Test: CI workflow that runs on AWS self-hosted runners.
# It runs the AOT build tests and the GPU unit tests for pushes and pull
# requests targeting main, executing inside Docker through ci/bash.sh
# (same as Jenkins).
#
# Permission control:
#   - Push to main: always runs.
#   - PR from org members (ci-users team): runs automatically.
#   - PR from external contributors: requires the 'run-ci' label
#     (added via the `@flashinfer-bot run` command from an authorized user).
name: PR Test

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
    # 'labeled' is included so that adding the 'run-ci' label starts a run.
    types:
      - opened
      - synchronize
      - reopened
      - labeled
  # Manual runs may opt out of either test family.
  workflow_dispatch:
    inputs:
      skip_aot:
        description: 'Skip AOT build tests'
        type: boolean
        default: false
      skip_gpu:
        description: 'Skip GPU tests'
        type: boolean
        default: false

# One active run per ref; a newer run cancels the one still in progress.
concurrency:
  group: pr-test-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: write

env:
  # NOTE(review): presumably read by ci/bash.sh for Jenkins parity - confirm.
  EXECUTOR_NUMBER: "0"
jobs:
# ---------------------------------------------------------------------------
# Gate - Check if PR is authorized to run CI
#
# Publishes the `authorized` output ('true' or 'false'). Decision order:
#   1. Non-PR events (push, workflow_dispatch) are always authorized.
#   2. A PR carrying the 'run-ci' label is authorized.
#   3. Otherwise the PR author must be listed in the ci-users team. When
#      FLASHINFER_GITHUB_TOKEN is empty, the PR's author_association is used
#      instead (OWNER, MEMBER or COLLABORATOR are accepted).
# Every decision path exits 0; downstream jobs read the output, not the
# job result.
# ---------------------------------------------------------------------------
  gate:
    name: Permission Check
    runs-on: ubuntu-latest
    outputs:
      authorized: ${{ steps.check.outputs.authorized }}
    steps:
      - name: Check authorization
        id: check
        # Event data reaches the script through environment variables rather
        # than being expanded into the script text, so PR-supplied values are
        # only ever handled as data by the shell.
        env:
          GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }}
          EVENT_NAME: ${{ github.event_name }}
          HAS_RUN_CI_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'run-ci') }}
          AUTHOR: ${{ github.event.pull_request.user.login }}
          ASSOC: ${{ github.event.pull_request.author_association }}
          ORG: ${{ github.repository_owner }}
          TEAM: ci-users
        run: |
          # Always allow push to main and workflow_dispatch
          if [[ "$EVENT_NAME" != "pull_request" ]]; then
            echo "authorized=true" >> "$GITHUB_OUTPUT"
            echo "Not a PR, authorized"
            exit 0
          fi

          # Check if PR has run-ci label
          if [[ "$HAS_RUN_CI_LABEL" == "true" ]]; then
            echo "authorized=true" >> "$GITHUB_OUTPUT"
            echo "PR has run-ci label, authorized"
            exit 0
          fi

          # Check if PR author is a member of ci-users team
          echo "Checking if $AUTHOR is a member of $ORG/$TEAM..."

          if [[ -z "$GH_TOKEN" ]]; then
            echo "::warning::FLASHINFER_GITHUB_TOKEN not set, falling back to association check"
            # Fallback: check if author has write access
            if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
              echo "authorized=true" >> "$GITHUB_OUTPUT"
              echo "PR author has $ASSOC access, authorized"
            else
              echo "authorized=false" >> "$GITHUB_OUTPUT"
              echo "PR author is $ASSOC, not authorized"
            fi
            exit 0
          fi

          # Check team membership. An API failure denies authorization but
          # does not fail the job.
          MEMBERS=$(gh api \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            --paginate \
            "/orgs/${ORG}/teams/${TEAM}/members" \
            --jq '.[].login' 2>&1) || {
            echo "::warning::Failed to get team members: $MEMBERS"
            echo "authorized=false" >> "$GITHUB_OUTPUT"
            exit 0
          }

          # -F compares the login as a fixed string: logins such as
          # "dependabot[bot]" contain regex metacharacters. The non-empty
          # guard keeps an empty author from matching an empty member list.
          if [[ -n "$AUTHOR" ]] && grep -Fqx -- "$AUTHOR" <<< "$MEMBERS"; then
            echo "authorized=true" >> "$GITHUB_OUTPUT"
            echo "$AUTHOR is a member of $TEAM, authorized"
          else
            echo "authorized=false" >> "$GITHUB_OUTPUT"
            echo "$AUTHOR is not a member of $TEAM, not authorized"
          fi
# ---------------------------------------------------------------------------
# Setup - Read docker tag and check if build should be skipped
#
# Outputs:
#   docker_tag - tag read from the flashinfer-ci-cu129 entry in
#                ci/docker-tags.yml
#   skip_build - 'true' when a PR only touches files matching SKIP_PATTERNS;
#                always 'false' for non-PR events
# ---------------------------------------------------------------------------
  setup:
    name: Setup
    needs: gate
    if: needs.gate.outputs.authorized == 'true'
    runs-on: ubuntu-latest
    outputs:
      docker_tag: ${{ steps.get-tag.outputs.tag }}
      skip_build: ${{ steps.check.outputs.skip }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Presumably full history so that both PR SHAs used by `git diff`
          # below are present locally - confirm.
          fetch-depth: 0

      - name: Get Docker Tag
        id: get-tag
        run: |
          # Take the text after the first ':' on the matching line, minus spaces.
          TAG=$(grep 'flashinfer/flashinfer-ci-cu129:' ci/docker-tags.yml | cut -d':' -f2 | tr -d ' ')
          if [ -z "$TAG" ]; then
            echo "::error::Failed to extract Docker tag from ci/docker-tags.yml"
            exit 1
          fi
          echo "tag=$TAG" >> "$GITHUB_OUTPUT"
          echo "Docker tag: $TAG"

      - name: Check Skip Conditions
        id: check
        # Event values are passed as environment variables instead of being
        # expanded into the script text.
        env:
          EVENT_NAME: ${{ github.event_name }}
          BASE_SHA: ${{ github.event.pull_request.base.sha }}
          HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          if [ "$EVENT_NAME" != "pull_request" ]; then
            echo "skip=false" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Use PR event SHAs for reliable diff (avoids issues with origin refs)
          CHANGED=$(git diff --name-only "$BASE_SHA...$HEAD_SHA")

          # TODO (yongwww): Add back ^\.github/ before merging to main
          # README\.md is intentionally left unanchored so it matches in any
          # directory; the dot is escaped so it only matches a literal '.'.
          SKIP_PATTERNS='README\.md|^docs/|^docker/|^licenses/|^LICENSE$|^NOTICE$|^version\.txt$'

          # Skip only if every changed path matches one of the patterns.
          SKIP=true
          while IFS= read -r file; do
            if [ -n "$file" ] && ! grep -qE "$SKIP_PATTERNS" <<< "$file"; then
              SKIP=false
              break
            fi
          done <<< "$CHANGED"

          echo "skip=$SKIP" >> "$GITHUB_OUTPUT"
          if [ "$SKIP" == "true" ]; then
            echo "::notice::Skipping build - only docs/config files changed"
          fi
# ---------------------------------------------------------------------------
# AOT Build Import Tests - x86_64 and aarch64 (multiple CUDA versions)
# Runs through ci/bash.sh with --no-gpu (same as Jenkins), once per
# (arch, cuda) pair on self-hosted CPU runners.
# ---------------------------------------------------------------------------
  aot-build-import:
    name: AOT Build Import (${{ matrix.arch }}, ${{ matrix.cuda }})
    needs:
      - gate
      - setup
    # Requires authorization, a non-skipped build, and skip_aot not set.
    if: >-
      needs.gate.outputs.authorized == 'true' &&
      needs.setup.outputs.skip_build != 'true' &&
      github.event.inputs.skip_aot != 'true'
    runs-on:
      - self-hosted
      - Linux
      - ${{ matrix.arch }}
      - cpu
    timeout-minutes: 360
    strategy:
      fail-fast: false
      matrix:
        arch:
          - X64
          - ARM64
        cuda:
          - cu126
          - cu128
          - cu129
          - cu130
    env:
      # The tag read by the setup job is applied to every per-CUDA image.
      DOCKER_IMAGE: flashinfer/flashinfer-ci-${{ matrix.cuda }}:${{ needs.setup.outputs.docker_tag }}
    steps:
      # Best-effort reset of the runner; every command tolerates failure.
      - name: Cleanup
        run: |
          # Stop and remove all Docker containers to free memory
          docker stop $(docker ps -q) 2>/dev/null || true
          docker rm $(docker ps -aq) 2>/dev/null || true
          # Wipe the workspace (regular and dot entries) and the JIT cache
          sudo rm -rf ${{ github.workspace }}/* || true
          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
          rm -rf ~/.cache/flashinfer_jit || true
          docker system prune -f || true

      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Login to Docker Hub
        # Don't fail if secret is unavailable (e.g., fork PRs)
        continue-on-error: true
        uses: docker/login-action@v3
        with:
          username: flashinfer
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Show Node Info
        env:
          NODE_NAME: ${{ runner.name }}
          WORKSPACE: ${{ github.workspace }}
          BUILD_NUMBER: ${{ github.run_number }}
        run: ./scripts/task_show_node_info.sh

      - name: Test JIT Cache Package Build and Import
        run: bash ci/bash.sh ${DOCKER_IMAGE} --no-gpu ./scripts/task_test_jit_cache_package_build_import.sh
# ---------------------------------------------------------------------------
# GPU JIT Tests - SM86 (A10G) - 5 Shards
# Runs through ci/bash.sh with GPU access (same as Jenkins); each shard
# executes scripts/task_jit_run_tests_part<shard>.sh.
# ---------------------------------------------------------------------------
  gpu-tests-a10g:
    name: JIT Unittest ${{ matrix.shard }} (A10G)
    needs:
      - gate
      - setup
    # Requires authorization, a non-skipped build, and skip_gpu not set.
    if: >-
      needs.gate.outputs.authorized == 'true' &&
      needs.setup.outputs.skip_build != 'true' &&
      github.event.inputs.skip_gpu != 'true'
    runs-on:
      - self-hosted
      - Linux
      - X64
      - gpu
      - sm86
    timeout-minutes: 360
    strategy:
      fail-fast: false
      matrix:
        shard:
          - 1
          - 2
          - 3
          - 4
          - 5
    env:
      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
    steps:
      # Best-effort reset of the runner; every command tolerates failure.
      - name: Cleanup
        run: |
          # Stop and remove all Docker containers to free GPU memory
          docker stop $(docker ps -q) 2>/dev/null || true
          docker rm $(docker ps -aq) 2>/dev/null || true
          # Wipe the workspace (regular and dot entries) and the JIT cache
          sudo rm -rf ${{ github.workspace }}/* || true
          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
          rm -rf ~/.cache/flashinfer_jit || true
          docker system prune -f || true
          # Print GPU state for the log
          nvidia-smi || true

      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Login to Docker Hub
        # Don't fail if secret is unavailable (e.g., fork PRs)
        continue-on-error: true
        uses: docker/login-action@v3
        with:
          username: flashinfer
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Show Node Info
        env:
          NODE_NAME: ${{ runner.name }}
          WORKSPACE: ${{ github.workspace }}
          BUILD_NUMBER: ${{ github.run_number }}
        run: ./scripts/task_show_node_info.sh

      - name: Run JIT Unittest Part ${{ matrix.shard }}
        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part${{ matrix.shard }}.sh
# ---------------------------------------------------------------------------
# GPU JIT Tests - SM75 (T4) - sampling tests only
# Runs through ci/bash.sh with GPU access (same as Jenkins); executes
# scripts/task_jit_run_tests_part3.sh on a single T4 runner.
# ---------------------------------------------------------------------------
  gpu-tests-t4:
    name: JIT Unittest (T4)
    needs:
      - gate
      - setup
    # Requires authorization, a non-skipped build, and skip_gpu not set.
    if: >-
      needs.gate.outputs.authorized == 'true' &&
      needs.setup.outputs.skip_build != 'true' &&
      github.event.inputs.skip_gpu != 'true'
    runs-on:
      - self-hosted
      - Linux
      - X64
      - gpu
      - sm75
    timeout-minutes: 360
    env:
      DOCKER_IMAGE: flashinfer/flashinfer-ci-cu129:${{ needs.setup.outputs.docker_tag }}
    steps:
      # Best-effort reset of the runner; every command tolerates failure.
      - name: Cleanup
        run: |
          # Stop and remove all Docker containers to free GPU memory
          docker stop $(docker ps -q) 2>/dev/null || true
          docker rm $(docker ps -aq) 2>/dev/null || true
          # Wipe the workspace (regular and dot entries) and the JIT cache
          sudo rm -rf ${{ github.workspace }}/* || true
          sudo rm -rf ${{ github.workspace }}/.[!.]* || true
          rm -rf ~/.cache/flashinfer_jit || true
          docker system prune -f || true
          # Print GPU state for the log
          nvidia-smi || true

      - uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Login to Docker Hub
        # Don't fail if secret is unavailable (e.g., fork PRs)
        continue-on-error: true
        uses: docker/login-action@v3
        with:
          username: flashinfer
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Show Node Info
        env:
          NODE_NAME: ${{ runner.name }}
          WORKSPACE: ${{ github.workspace }}
          BUILD_NUMBER: ${{ github.run_number }}
        run: ./scripts/task_show_node_info.sh

      - name: Run JIT Unittest Part 3 (T4)
        run: bash ci/bash.sh ${DOCKER_IMAGE} ./scripts/task_jit_run_tests_part3.sh
# ---------------------------------------------------------------------------
# Test Results Summary
#
# Always runs. Writes a summary to the step summary and fails when:
#   - the gate or setup job itself did not succeed, or
#   - a test job did not succeed and was not explicitly skipped through the
#     workflow_dispatch inputs.
# Exits 0 when CI is pending authorization or the build was skipped.
# ---------------------------------------------------------------------------
  test-results-summary:
    name: Test Results Summary
    if: always()
    needs: [gate, setup, aot-build-import, gpu-tests-a10g, gpu-tests-t4]
    runs-on: ubuntu-latest
    steps:
      - name: Check Results
        # Job results and outputs are passed as environment variables rather
        # than expanded into the script text.
        env:
          GATE_RESULT: ${{ needs.gate.result }}
          AUTHORIZED: ${{ needs.gate.outputs.authorized }}
          SETUP_RESULT: ${{ needs.setup.result }}
          SKIP_BUILD: ${{ needs.setup.outputs.skip_build }}
          AOT: ${{ needs.aot-build-import.result }}
          A10G: ${{ needs.gpu-tests-a10g.result }}
          T4: ${{ needs.gpu-tests-t4.result }}
          SKIP_AOT: ${{ github.event.inputs.skip_aot }}
          SKIP_GPU: ${{ github.event.inputs.skip_gpu }}
        run: |
          echo "## Test Results Summary" >> "$GITHUB_STEP_SUMMARY"

          # A gate job that did not succeed leaves `authorized` empty; report
          # that as a failure instead of as "pending authorization".
          if [ "$GATE_RESULT" != "success" ]; then
            echo "Permission check did not complete (result: $GATE_RESULT)" >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi

          # Check if CI was skipped due to permissions
          if [ "$AUTHORIZED" != "true" ]; then
            echo "CI skipped (pending authorization)" >> "$GITHUB_STEP_SUMMARY"
            echo "A contributor in @flashinfer-ai/ci-users can comment \`@flashinfer-bot run\` to approve." >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi

          # Setup runs whenever the PR is authorized, so anything other than
          # success here is a real failure.
          if [ "$SETUP_RESULT" != "success" ]; then
            echo "Setup did not complete (result: $SETUP_RESULT)" >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi

          if [ "$SKIP_BUILD" == "true" ]; then
            echo "Build skipped (docs/config only changes)" >> "$GITHUB_STEP_SUMMARY"
            exit 0
          fi

          echo "AOT Build Import: $AOT" >> "$GITHUB_STEP_SUMMARY"
          echo "GPU Tests (A10G): $A10G" >> "$GITHUB_STEP_SUMMARY"
          echo "GPU Tests (T4): $T4" >> "$GITHUB_STEP_SUMMARY"

          # Fail if any required job is not success (unless explicitly skipped)
          if { [ "$AOT" != "success" ] && [ "$SKIP_AOT" != "true" ]; } || \
             { [ "$A10G" != "success" ] && [ "$SKIP_GPU" != "true" ]; } || \
             { [ "$T4" != "success" ] && [ "$SKIP_GPU" != "true" ]; }; then
            echo "**Tests Failed**" >> "$GITHUB_STEP_SUMMARY"
            exit 1
          fi

          echo "**Tests Passed**" >> "$GITHUB_STEP_SUMMARY"