Skip to content

Commit a0d024d

Browse files
committed
[CI/CD] Add triton3.2 ci
1 parent fe0d672 commit a0d024d

File tree

4 files changed

+145
-81
lines changed

4 files changed

+145
-81
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
# CI workflow: build FlagTree from source and run a vector-add smoke test
# on a self-hosted Ascend runner, for the triton_v3.2.x branch.
name: Ascend-Build-And-Test

on:
  push:
    branches: [ "triton_v3.2.x" ]
  pull_request:
    branches: [ "triton_v3.2.x" ]

# Cancel superseded runs of the same PR/branch to free the runner.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  ascend-build-and-test:
    runs-on: ascend
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: FlagTree Build on Ascend
        shell: bash
        run: |
          # ~/env.sh is provisioned on the self-hosted runner (toolchain env).
          source ~/env.sh
          cd python
          MAX_JOBS=32 python3.9 -m pip install . --no-build-isolation

      - name: FlagTree Test on Ascend
        shell: bash
        run: |
          # Each `run:` step starts in $GITHUB_WORKSPACE — the `cd python`
          # in the build step does not persist — so the tutorial path must
          # be repo-root-relative, not `../third_party/...` (which would
          # escape the checkout directory).
          python3.9 third_party/ascend/test/tutorials/01-vector-add.py
Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
name: NV-Build-And-Test
22

33
on:
4+
schedule:
5+
- cron: '0 21 * * *'
46
push:
5-
branches: [ "main" ]
7+
branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
68
pull_request:
7-
branches: [ "main" ]
9+
branches: [ "main", "triton_v3.2.x", "triton_v3.3.x" ]
810

911
concurrency:
1012
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -17,14 +19,42 @@ jobs:
1719
- name: Checkout code
1820
uses: actions/checkout@v4
1921

20-
- name: FlagTree Build on NVIDIA-A100
22+
- name: Detect Target Branch
23+
shell: bash
24+
run: |
25+
if [ "${{ github.event_name }}" = "pull_request" ]; then
26+
TARGET_BRANCH="${{ github.base_ref }}"
27+
else
28+
TARGET_BRANCH="${{ github.ref_name }}"
29+
fi
30+
echo "TARGET_BRANCH=$TARGET_BRANCH" >> $GITHUB_ENV
31+
echo "TARGET_BRANCH=$TARGET_BRANCH"
32+
33+
- name: FlagTree Build (Main branch)
34+
if: ${{ env.TARGET_BRANCH == 'main' }}
2135
shell: bash
2236
run: |
2337
source ~/env.sh
2438
cd python
25-
MAX_JOBS=20 pip3.11 install . --no-build-isolation
39+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
40+
41+
- name: FlagTree Build (triton_v3.2.x branch)
42+
if: ${{ env.TARGET_BRANCH == 'triton_v3.2.x' }}
43+
shell: bash
44+
run: |
45+
source ~/env-3.2.sh
46+
cd python
47+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
48+
49+
- name: FlagTree Build (triton_v3.3.x branch)
50+
if: ${{ env.TARGET_BRANCH == 'triton_v3.3.x' }}
51+
shell: bash
52+
run: |
53+
source ~/env-3.3.sh
54+
cd python
55+
MAX_JOBS=32 pip3.11 install . --no-build-isolation
2656
27-
- name: FlagTree Test on NVIDIA-A100
57+
- name: FlagTree Test
2858
shell: bash
2959
run: |
3060
pytest -s python/test/unit

.github/workflows/wheels_v2.yml

Lines changed: 0 additions & 76 deletions
This file was deleted.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
"""
2+
Vector Addition
3+
===============
4+
5+
In this tutorial, you will write a simple vector addition using Triton.
6+
7+
In doing so, you will learn about:
8+
9+
* The basic programming model of Triton.
10+
11+
* The `triton.jit` decorator, which is used to define Triton kernels.
12+
13+
* The best practices for validating and benchmarking your custom ops against native reference implementations.
14+
15+
"""
16+
17+
# %%
18+
# Compute Kernel
19+
# --------------
20+
21+
import torch
22+
import torch_npu
23+
24+
import triton
25+
import triton.language as tl
26+
27+
28+
@triton.jit
def add_kernel(x_ptr,  # *Pointer* to first input vector.
               y_ptr,  # *Pointer* to second input vector.
               output_ptr,  # *Pointer* to output vector.
               n_elements,  # Size of the vector.
               BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
               # NOTE: `constexpr` so it can be used as a shape value.
               ):
    # Multiple 'programs' run this kernel on different slices of the data.
    # On a 1D launch grid, program_id(axis=0) identifies which tile is ours.
    tile = tl.program_id(axis=0)
    # Global element indices covered by this tile. For a 256-element vector
    # with BLOCK_SIZE=64, the four tiles cover [0:64, 64:128, 128:192, 192:256].
    # Added to a pointer, `idx` acts as a vector of element pointers.
    idx = tile * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # Guard loads/stores in case n_elements is not a multiple of BLOCK_SIZE.
    in_bounds = idx < n_elements
    # Masked loads: out-of-bounds lanes are not read.
    lhs = tl.load(x_ptr + idx, mask=in_bounds)
    rhs = tl.load(y_ptr + idx, mask=in_bounds)
    # Write the element-wise sum back to DRAM, masking the tail lanes.
    tl.store(output_ptr + idx, lhs + rhs, mask=in_bounds)
54+
55+
56+
# %%
57+
# Let's also declare a helper function to (1) allocate the `z` tensor
58+
# and (2) enqueue the above kernel with appropriate grid/block sizes:
59+
60+
61+
def add(x: torch.Tensor, y: torch.Tensor):
    """Return the element-wise sum of *x* and *y*, computed by `add_kernel`.

    Allocates the output tensor and enqueues the kernel on a 1D grid with
    one program instance per BLOCK_SIZE-sized tile (rounded up so every
    element is covered).
    """
    result = torch.empty_like(x)
    count = result.numel()

    # `meta` carries the kernel's compile-time constants, so the grid size
    # can be derived from whatever BLOCK_SIZE the launch specializes on.
    def grid(meta):
        return (triton.cdiv(count, meta['BLOCK_SIZE']), )

    add_kernel[grid](x, y, result, count, BLOCK_SIZE=1024)
    return result
67+
68+
69+
# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)  # deterministic inputs so the check is reproducible
size = 98432  # not a multiple of BLOCK_SIZE, so the mask path is exercised
x = torch.rand(size, device='npu')
y = torch.rand(size, device='npu')
output_torch = x + y  # reference result from native torch
output_triton = add(x, y)  # result from the Triton kernel
print(output_torch)
print(output_triton)
# Largest absolute element-wise deviation between the two results.
max_abs_diff = torch.max(torch.abs(output_torch - output_triton))
print(f'The maximum difference between torch and triton is '
      f'{max_abs_diff}')

0 commit comments

Comments
 (0)