Skip to content

Commit 0d1c55a

Browse files
committed
[fix bugs]
1 parent f0a7196 commit 0d1c55a

File tree

4 files changed

+83
-3
lines changed

4 files changed

+83
-3
lines changed

.github/workflows/ascend-build-and-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,4 @@ jobs:
2929
shell: bash
3030
run: |
3131
source /usr/local/Ascend/ascend-toolkit/set_env.sh
32-
python3.9 third_party/ascend/examples/tutorials/01-vector-add.py
32+
python3.9 third_party/tests/ascend/vector-add.py

.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ ptxas
5858
third_party/nvidia/backend/include
5959
third_party/nvidia/backend/lib/cupti
6060

61-
6261
# Docs
6362
docs/_build/
6463
docs/python-api/generated/

python/setup_tools/setup_helper.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,12 +276,13 @@ class CommonUtils:
276276

277277
@staticmethod
278278
def unlink():
279-
cur_path = os.path.dirname(__file__)
279+
cur_path = dir_rollback(2, __file__)
280280
if "editable_wheel" in sys.argv:
281281
installation_dir = cur_path
282282
else:
283283
installation_dir = get_python_lib()
284284
backends_dir_path = Path(installation_dir) / "triton" / "backends"
285+
# raise RuntimeError(backends_dir_path)
285286
if not os.path.exists(backends_dir_path):
286287
return
287288
for name in os.listdir(backends_dir_path):
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Vector Addition
3+
===============
4+
5+
In this tutorial, you will write a simple vector addition using Triton.
6+
7+
In doing so, you will learn about:
8+
9+
* The basic programming model of Triton.
10+
11+
* The `triton.jit` decorator, which is used to define Triton kernels.
12+
13+
* The best practices for validating and benchmarking your custom ops against native reference implementations.
14+
15+
"""
16+
17+
# %%
18+
# Compute Kernel
19+
# --------------
20+
21+
import torch
22+
import torch_npu
23+
24+
import triton
25+
import triton.language as tl
26+
27+
28+
@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):
    """Element-wise vector addition: each program handles one BLOCK_SIZE chunk.

    Loads a masked slice of `x_ptr` and `y_ptr`, adds them, and stores the
    result to the matching slice of `output_ptr`.
    """
    # Identify this program instance within the 1D launch grid.
    block_id = tl.program_id(axis=0)
    # Offsets of the elements this program owns, e.g. for a length-256 vector
    # and BLOCK_SIZE=64 the programs cover [0:64], [64:128], [128:192], [192:256].
    offs = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Mask guards loads/stores when n_elements is not a multiple of BLOCK_SIZE.
    in_bounds = offs < n_elements
    # Gather both operands from DRAM, masking out-of-range lanes.
    lhs = tl.load(x_ptr + offs, mask=in_bounds)
    rhs = tl.load(y_ptr + offs, mask=in_bounds)
    # Compute the sum and scatter it back to DRAM.
    tl.store(output_ptr + offs, lhs + rhs, mask=in_bounds)
54+
55+
56+
# %%
57+
# Let's also declare a helper function to (1) allocate the `z` tensor
58+
# and (2) enqueue the above kernel with appropriate grid/block sizes:
59+
60+
61+
def add(x: torch.Tensor, y: torch.Tensor):
    """Add two tensors element-wise by launching `add_kernel`.

    Allocates the output tensor, derives a 1D grid covering every element,
    and enqueues the kernel with a block size of 1024.
    """
    result = torch.empty_like(x)
    total = result.numel()

    # One program per BLOCK_SIZE-sized chunk; the grid is resolved lazily
    # from the kernel's launch-time meta-parameters.
    def grid(meta):
        return (triton.cdiv(total, meta['BLOCK_SIZE']), )

    add_kernel[grid](x, y, result, total, BLOCK_SIZE=1024)
    return result
67+
68+
69+
# %%
# Exercise `add` on the NPU and check it against torch's native `+`:
# same seed, same inputs, then print both results and their max deviation.
torch.manual_seed(0)
n_elems = 98432
x = torch.rand(n_elems, device='npu')
y = torch.rand(n_elems, device='npu')
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(f'The maximum difference between torch and triton is '
      f'{torch.max(torch.abs(output_torch - output_triton))}')

0 commit comments

Comments
 (0)