Skip to content

Commit a0d024d

Browse files
committed
[CI/CD] Add triton3.2 ci
1 parent fe0d672 commit a0d024d

File tree

4 files changed

+145
-81
lines changed

4 files changed

+145
-81
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# CI workflow: build FlagTree from source and run a vector-add smoke test
# on a self-hosted Ascend runner, for the triton_v3.2.x branch.
name: Ascend-Build-And-Test

on:
  push:
    branches: [ "triton_v3.2.x" ]
  pull_request:
    branches: [ "triton_v3.2.x" ]

# Cancel superseded runs of the same PR/branch to free the runner.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  ascend-build-and-test:
    runs-on: ascend
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: FlagTree Build on Ascend
        shell: bash
        run: |
          # ~/env.sh is provisioned on the self-hosted runner (toolchain env).
          source ~/env.sh
          cd python
          MAX_JOBS=32 python3.9 -m pip install . --no-build-isolation

      - name: FlagTree Test on Ascend
        shell: bash
        run: |
          # Each `run:` step starts in $GITHUB_WORKSPACE — the `cd python`
          # in the build step does not persist — so the tutorial path must
          # be repo-root-relative, not `../third_party/...` (which would
          # escape the checkout directory).
          python3.9 third_party/ascend/test/tutorials/01-vector-add.py
Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
name: NV-Build-And-Test
22

33
on:
4+
schedule:
5+
- cron: '0 21 * * *'
46
push:
5-
branches: [ "main" ]
7+
branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
68
pull_request:
7-
branches: [ "main" ]
9+
branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
810

911
concurrency:
1012
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -17,14 +19,42 @@ jobs:
1719
- name: Checkout code
1820
uses: actions/checkout@v4
1921

20-
- name: FlagTree Build on NVIDIA-A100
22+
- name: Detect Target Branch
23+
shell: bash
24+
run: |
25+
if [ "${{ github.event_name }}" = "pull_request" ]; then
26+
TARGET_BRANCH="${{ github.base_ref }}"
27+
else
28+
TARGET_BRANCH="${{ github.ref_name }}"
29+
fi
30+
echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV
31+
echo "TARGET_BRANCH=$TARGET_BRANCH"
32+
33+
- name: FlagTree Build (Main branch)
34+
if: ${{ env.TARGET_BRANCH == 'main' }}
2135
shell: bash
2236
run: |
2337
source ~/env.sh
2438
cd python
25-
MAX_JOBS=20 pip3.11 install . --no-build-isolation
39+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
40+
41+
- name: FlagTree Build (triton_v3.2.x branch)
42+
if: ${{ env.TARGET_BRANCH == 'triton_v3.2.x' }}
43+
shell: bash
44+
run: |
45+
source ~/env-3.2.sh
46+
cd python
47+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
48+
49+
- name: FlagTree Build (triton_v3.3.x branch)
50+
if: ${{ env.TARGET_BRANCH == 'triton_v3.3.x' }}
51+
shell: bash
52+
run: |
53+
source ~/env-3.3.sh
54+
cd python
55+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
2656
27-
- name: FlagTree Test on NVIDIA-A100
57+
- name: FlagTree Test
2858
shell: bash
2959
run: |
3060
pytest -s python/test/unit

.github/workflows/wheels_v2.yml

Lines changed: 0 additions & 76 deletions
This file was deleted.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Vector Addition
3+
===============
4+
5+
In this tutorial, you will write a simple vector addition using Triton.
6+
7+
In doing so, you will learn about:
8+
9+
* The basic programming model of Triton.
10+
11+
* The `triton.jit` decorator, which is used to define Triton kernels.
12+
13+
* The best practices for validating and benchmarking your custom ops against native reference implementations.
14+
15+
"""
16+
17+
# %%
18+
# Compute Kernel
19+
# --------------
20+
21+
import torch
22+
import torch_npu
23+
24+
import triton
25+
import triton.language as tl
26+
27+
28+
@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):
    # Multiple 'programs' run this kernel on different slices of the data.
    # On a 1D launch grid, program_id(axis=0) identifies which tile is ours.
    tile = tl.program_id(axis=0)
    # Global element indices covered by this tile. For a 256-element vector
    # with BLOCK_SIZE=64, the four tiles cover [0:64, 64:128, 128:192, 192:256].
    # Added to a pointer, `idx` acts as a vector of element pointers.
    idx = tile * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Guard loads/stores in case n_elements is not a multiple of BLOCK_SIZE.
    in_bounds = idx < n_elements
    # Masked loads: out-of-bounds lanes are not read.
    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)
    # Write the element-wise sum back to DRAM, masking the tail lanes.
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)
54+
55+
56+
# %%
57+
# Let's also declare a helper function to (1) allocate the `z` tensor
58+
# and (2) enqueue the above kernel with appropriate grid/block sizes:
59+
60+
61+
def add(x: torch.Tensor, y: torch.Tensor):
    """Return the element-wise sum of *x* and *y*, computed by `add_kernel`.

    Allocates the output tensor and enqueues the kernel on a 1D grid with
    one program instance per BLOCK_SIZE-sized tile (rounded up so every
    element is covered).
    """
    result = torch.empty_like(x)
    count = result.numel()

    # `meta` carries the kernel's compile-time constants, so the grid size
    # can be derived from whatever BLOCK_SIZE the launch specializes on.
    def grid(meta):
        return (triton.cdiv(count, meta['BLOCK_SIZE']), )

    add_kernel[grid](x, y, result, count, BLOCK_SIZE=1024)
    return result
67+
68+
69+
# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)  # deterministic inputs so the check is reproducible
size = 98432  # not a multiple of BLOCK_SIZE, so the mask path is exercised
x = torch.rand(size, device='npu')
y = torch.rand(size, device='npu')
output_torch = x + y  # reference result from native torch
output_triton = add(x, y)  # result from the Triton kernel
print(output_torch)
print(output_triton)
# Largest absolute element-wise deviation between the two results.
max_abs_diff = torch.max(torch.abs(output_torch - output_triton))
print(f'The maximum difference between torch and triton is '
      f'{max_abs_diff}')

0 commit comments

Comments
 (0)