Skip to content

Commit 37c254c

Browse files
committed
Merge branch 'improve-hopper-gdn-prefill' of github.com:yzh119/flashinfer-dev into improve-hopper-gdn-prefill
2 parents d7499c8 + e5236d7 commit 37c254c

96 files changed

Lines changed: 15347 additions & 6390 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.claude/skills/add-cuda-kernel/SKILL.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@ def gen_scale_module(dtype_in, dtype_out):
234234
- No Jinja template needed for simple operations
235235
- Just copy source files to generation directory
236236
- URI uniquely identifies the module configuration
237+
- **NEVER write to package directories** - see "JIT Directory Rules" in `CLAUDE.md`
237238

238239
### (Optional) Specifying Supported CUDA Architectures
239240

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
# CI bot command handler.
#
# Authorized users (members of the ci-users team) control CI by commenting
# on a pull request:
#   @flashinfer-bot run          - add the run-ci label to trigger CI
#   @flashinfer-bot rerun        - cancel and rerun all workflows
#   @flashinfer-bot rerun failed - rerun failed and cancelled jobs
#   @flashinfer-bot stop         - cancel all in-progress workflows

name: CI Bot Commands

on:
  issue_comment:
    types:
      - created

permissions:
  contents: read
  pull-requests: write
  actions: write

jobs:
  handle-command:
    # Act only on comments that belong to a pull request and that mention
    # @flashinfer-bot; every other comment skips the job entirely.
    if: >-
      github.event.issue.pull_request &&
      contains(github.event.comment.body, '@flashinfer-bot')
    runs-on: ubuntu-latest

    steps:
28+
- name: Check team membership
  id: check-permission
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_GITHUB_TOKEN }}
    ORG: ${{ github.repository_owner }}
    TEAM: ci-users
    ACTOR: ${{ github.event.comment.user.login }}
  run: |
    # Sets the step output `authorized` to "true" only when the commenter is
    # listed as a member of $ORG/$TEAM. Every failure path writes
    # authorized=false and exits 0 so that later steps can still react.
    echo "Checking if $ACTOR is a member of $ORG/$TEAM..."

    # Verify token is set
    if [[ -z "$GH_TOKEN" ]]; then
      echo "::error::FLASHINFER_GITHUB_TOKEN secret is not set"
      echo "authorized=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi

    # List team members, one login per line
    MEMBERS=$(gh api \
      -H "Accept: application/vnd.github+json" \
      -H "X-GitHub-Api-Version: 2022-11-28" \
      --paginate \
      "/orgs/${ORG}/teams/${TEAM}/members" \
      --jq '.[].login' 2>&1) || {
      echo "::error::Failed to get team members: $MEMBERS"
      echo "authorized=false" >> "$GITHUB_OUTPUT"
      exit 0
    }

    # -x requires a whole-line match; -F makes grep treat the login as a
    # literal string. Without -F a login such as "name[bot]" is parsed as a
    # regex bracket expression and could match a different member's login.
    if printf '%s\n' "$MEMBERS" | grep -qxF -- "$ACTOR"; then
      echo "$ACTOR is a member of $TEAM"
      echo "authorized=true" >> "$GITHUB_OUTPUT"
    else
      echo "$ACTOR is not a member of $TEAM"
      echo "authorized=false" >> "$GITHUB_OUTPUT"
    fi
64+
65+
- name: Parse command
  id: parse
  env:
    COMMENT_BODY: ${{ github.event.comment.body }}
  run: |
    # Case-insensitive search for a phrase anywhere in the comment body.
    mentions() {
      grep -qi "$1" <<< "$COMMENT_BODY"
    }

    # The first matching phrase wins. "rerun failed" has to be tested before
    # "rerun" because the shorter phrase is a prefix of the longer one.
    if mentions "@flashinfer-bot rerun failed"; then
      parsed=rerun-failed
    elif mentions "@flashinfer-bot rerun"; then
      parsed=rerun
    elif mentions "@flashinfer-bot stop"; then
      parsed=stop
    elif mentions "@flashinfer-bot run"; then
      parsed=run
    else
      parsed=unknown
    fi

    # Publish the result as the step output `command`.
    echo "command=${parsed}" >> "$GITHUB_OUTPUT"
81+
82+
- name: Handle @flashinfer-bot run
  if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'run'
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
    REPO: ${{ github.repository }}
    PR_NUMBER: ${{ github.event.issue.number }}
    COMMENT_ID: ${{ github.event.comment.id }}
  run: |
    echo "Adding run-ci label to PR #${PR_NUMBER}"

    # Put the run-ci label on the pull request
    gh pr edit "$PR_NUMBER" --repo "$REPO" --add-label "run-ci"

    # Acknowledge the command with a thumbs-up reaction on the comment
    gh api -X POST \
      "/repos/${REPO}/issues/comments/${COMMENT_ID}/reactions" \
      -f content='+1'

    echo "Label added successfully"
101+
102+
- name: Handle @flashinfer-bot rerun
  if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'rerun'
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
    REPO: ${{ github.repository }}
    PR_NUMBER: ${{ github.event.issue.number }}
    COMMENT_ID: ${{ github.event.comment.id }}
  run: |
    # Cancels every active workflow run on the PR head commit, waits for the
    # cancellations to land, then reruns every run recorded for that commit.
    echo "Rerunning all jobs for PR #${PR_NUMBER}"

    # Get PR head SHA
    PR_SHA=$(gh pr view "$PR_NUMBER" \
      --repo "$REPO" \
      --json headRefOid -q '.headRefOid')

    echo "PR HEAD SHA: $PR_SHA"

    # Collect in-progress and queued runs. The list is captured in a variable
    # (not piped into a loop) so that a failing `gh run list` aborts the step
    # instead of being silently ignored. --limit raises gh's default of 20.
    echo "Cancelling in-progress runs..."
    ACTIVE_RUNS=$(gh run list \
      --repo "$REPO" \
      --commit "$PR_SHA" \
      --limit 100 \
      --json databaseId,status \
      -q '.[] | select(.status == "in_progress" or .status == "queued") | .databaseId')

    for run_id in $ACTIVE_RUNS; do
      echo "Cancelling workflow $run_id..."
      # Best effort: the run may have finished on its own in the meantime.
      gh run cancel "$run_id" --repo "$REPO" || true
    done

    # Cancellation is asynchronous and a run that is still active cannot be
    # rerun, so poll until the cancelled runs report "completed". The budget
    # of 30 polls (2s apart) is shared by all runs to bound the total wait.
    polls_left=30
    for run_id in $ACTIVE_RUNS; do
      while [ "$polls_left" -gt 0 ]; do
        run_status=$(gh run view "$run_id" --repo "$REPO" --json status -q '.status' || true)
        if [ "$run_status" = "completed" ]; then
          break
        fi
        polls_left=$((polls_left - 1))
        sleep 2
      done
    done

    # Rerun all workflow runs for this commit
    echo "Rerunning all workflows..."
    ALL_RUNS=$(gh run list \
      --repo "$REPO" \
      --commit "$PR_SHA" \
      --limit 100 \
      --json databaseId -q '.[].databaseId')

    for run_id in $ALL_RUNS; do
      echo "Rerunning workflow $run_id..."
      # Keep going when a single rerun fails, but make the failure visible.
      gh run rerun "$run_id" --repo "$REPO" || \
        echo "::warning::Failed to rerun workflow $run_id"
    done

    # React with thumbs up
    gh api -X POST \
      "/repos/${REPO}/issues/comments/${COMMENT_ID}/reactions" \
      -f content='+1'

    echo "Rerun triggered successfully"
152+
153+
- name: Handle @flashinfer-bot rerun failed
  if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'rerun-failed'
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
    REPO: ${{ github.repository }}
    PR_NUMBER: ${{ github.event.issue.number }}
    COMMENT_ID: ${{ github.event.comment.id }}
  run: |
    echo "Rerunning failed/cancelled jobs for PR #${PR_NUMBER}"

    # Resolve the head commit of the pull request
    PR_SHA=$(gh pr view "$PR_NUMBER" --repo "$REPO" --json headRefOid -q '.headRefOid')

    echo "PR HEAD SHA: $PR_SHA"

    # Walk the runs of this commit once per conclusion of interest.
    # Cancelled runs are included because fail-fast commonly cancels the
    # remaining jobs as soon as one job fails.
    for STATUS in failure cancelled; do
      for run_id in $(gh run list \
        --repo "$REPO" \
        --commit "$PR_SHA" \
        --status "$STATUS" \
        --json databaseId -q '.[].databaseId'); do
        echo "Rerunning $STATUS workflow $run_id..."
        # Best effort: one run that cannot be rerun must not stop the rest.
        gh run rerun "$run_id" --repo "$REPO" --failed || true
      done
    done

    # Acknowledge the command with a thumbs-up reaction on the comment
    gh api -X POST \
      "/repos/${REPO}/issues/comments/${COMMENT_ID}/reactions" \
      -f content='+1'

    echo "Rerun-failed triggered successfully"
190+
191+
- name: Handle @flashinfer-bot stop
  if: steps.check-permission.outputs.authorized == 'true' && steps.parse.outputs.command == 'stop'
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
    REPO: ${{ github.repository }}
    PR_NUMBER: ${{ github.event.issue.number }}
    COMMENT_ID: ${{ github.event.comment.id }}
  run: |
    # Cancels every in-progress or queued workflow run on the PR head commit
    # and reports how many cancellations were accepted.
    echo "Stopping all workflows for PR #${PR_NUMBER}"

    # Get PR head SHA
    PR_SHA=$(gh pr view "$PR_NUMBER" \
      --repo "$REPO" \
      --json headRefOid -q '.headRefOid')

    echo "PR HEAD SHA: $PR_SHA"

    # Capture the run IDs first and loop in the current shell. Piping the
    # list into `while read` would run the loop in a subshell, where updates
    # to CANCEL_COUNT are lost. Capturing also makes a failing `gh run list`
    # abort the step. --limit raises gh's default of 20 runs.
    ACTIVE_RUNS=$(gh run list \
      --repo "$REPO" \
      --commit "$PR_SHA" \
      --limit 100 \
      --json databaseId,status \
      -q '.[] | select(.status == "in_progress" or .status == "queued") | .databaseId')

    CANCEL_COUNT=0
    for run_id in $ACTIVE_RUNS; do
      echo "Cancelling workflow $run_id..."
      # Best effort: a run may finish on its own before the cancel arrives.
      if gh run cancel "$run_id" --repo "$REPO"; then
        CANCEL_COUNT=$((CANCEL_COUNT + 1))
      else
        echo "::warning::Failed to cancel workflow $run_id"
      fi
    done
    echo "Cancelled $CANCEL_COUNT workflow run(s)"

    # React with thumbs up
    gh api -X POST \
      "/repos/${REPO}/issues/comments/${COMMENT_ID}/reactions" \
      -f content='+1'

    echo "Stop triggered successfully"
226+
227+
- name: Unauthorized user
  if: steps.check-permission.outputs.authorized != 'true' && steps.parse.outputs.command != 'unknown'
  env:
    GH_TOKEN: ${{ secrets.FLASHINFER_BOT_TOKEN }}
    REPO: ${{ github.repository }}
    COMMENT_ID: ${{ github.event.comment.id }}
    COMMENTER: ${{ github.event.comment.user.login }}
  run: |
    # Reached only when a recognised command came from a commenter who did
    # not pass the membership check.
    echo "User ${COMMENTER} is not authorized"

    # Signal the refusal with a "confused" reaction on the comment
    gh api -X POST \
      "/repos/${REPO}/issues/comments/${COMMENT_ID}/reactions" \
      -f content='confused'

.github/workflows/nightly-release.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ jobs:
127127
cuda: ["12.8", "12.9", "13.0"]
128128
arch: ['x86_64', 'aarch64']
129129

130-
runs-on: [self-hosted, "${{ matrix.arch == 'aarch64' && 'arm64' || matrix.arch }}"]
130+
runs-on: [self-hosted, linux, "${{ matrix.arch == 'aarch64' && 'arm64' || 'x64' }}", cpu, on-demand]
131131

132132
steps:
133133
- name: Display Machine Information
@@ -281,7 +281,7 @@ jobs:
281281
matrix:
282282
cuda: ["12.9", "13.0"]
283283
test-shard: [1, 2, 3, 4, 5]
284-
runs-on: [self-hosted, G5, X64]
284+
runs-on: [self-hosted, linux, x64, gpu, sm86, on-demand]
285285

286286
steps:
287287
- name: Display Machine Information
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Auto-remove the run-ci label when new commits are pushed to external PRs.
# This ensures maintainers must re-approve after code changes.
#
# The trigger is pull_request_target rather than pull_request: for pull
# requests opened from forks, a pull_request-triggered run receives a
# read-only GITHUB_TOKEN, so the label could never be removed from the
# external PRs this workflow exists for. pull_request_target is safe here
# because the job never checks out or executes code from the pull request;
# it only reads fields of the event payload.

name: PR Label Cleanup

on:
  pull_request_target:
    types: [synchronize]  # New commits pushed

permissions:
  pull-requests: write

jobs:
  remove-label:
    # Run only when the PR currently carries the run-ci label. Whether the
    # author counts as external is decided by the first step.
    if: contains(github.event.pull_request.labels.*.name, 'run-ci')
    runs-on: ubuntu-latest
    steps:
      - name: Check if external contributor
        id: check
        env:
          # Passed through the environment instead of being interpolated
          # into the script text.
          ASSOC: ${{ github.event.pull_request.author_association }}
        run: |
          # Authors with OWNER, MEMBER or COLLABORATOR association keep the
          # label; any other association is treated as external.
          if [[ "$ASSOC" =~ ^(OWNER|MEMBER|COLLABORATOR)$ ]]; then
            echo "is_external=false" >> "$GITHUB_OUTPUT"
            echo "PR author has $ASSOC access, keeping label"
          else
            echo "is_external=true" >> "$GITHUB_OUTPUT"
            echo "PR author is $ASSOC (external), will remove label"
          fi

      - name: Remove run-ci label
        if: steps.check.outputs.is_external == 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          echo "Removing run-ci label from PR #${PR_NUMBER}"
          gh pr edit "$PR_NUMBER" \
            --repo "$REPO" \
            --remove-label "run-ci"

          # Post a comment explaining why
          gh pr comment "$PR_NUMBER" \
            --repo "$REPO" \
            --body "New commits detected. The \`run-ci\` label has been removed for security.

          A maintainer can re-approve by commenting \`@flashinfer-bot run\`"

0 commit comments

Comments
 (0)