From 2dee64ef245893f100fdffe8a64133a83407f2ca Mon Sep 17 00:00:00 2001 From: erweiw Date: Tue, 2 Jun 2026 21:17:58 -0700 Subject: [PATCH 1/3] Bump mlir-air to 9377b0e and triton_shared to e0c5133 Picks up Xilinx/mlir-air#1645 ("reset memtile counter per bucket-shape group"), which restores i8 matmul compilation speed for the Triton-XDNA-generated broadcast pattern. matmul_i8_m128_n64_k64 cold compile drops from ~145s back to ~9s, full sweep from a 1200s timeout back to ~150s. Also pulls in the Path B follow-up (#1609) and Stage C RFC #1567 changes. The Path B work in mlir-air migrated transform ops from !pdl.operation to !transform.any_op, requiring explicit result type annotations on transform.air.linalg_promote and transform.air.fuse_into_containing_op in the fallback transform script used by kernels that do not supply AIR_TRANSFORM_TILING_SCRIPT. triton_shared bumps from c043a85 to e0c5133 (upstream main, "Fix compiler warnings"); the patch is regenerated with no semantic change (line-number drift only in PtrAnalysis.cpp). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- amd_triton_npu/backend/driver.py | 16 ++++++++-------- third_party/triton_shared | 2 +- third_party/triton_shared.patch | 4 ++-- utils/mlir-air-hash.txt | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/amd_triton_npu/backend/driver.py b/amd_triton_npu/backend/driver.py index 6d58ebb..af7c543 100644 --- a/amd_triton_npu/backend/driver.py +++ b/amd_triton_npu/backend/driver.py @@ -707,22 +707,22 @@ def _get_transform_ir_string(): transform.named_sequence @__transform_main(%arg1: !transform.any_op {{transform.readonly}}) {{ %mul = transform.structured.match ops{{["linalg.mul"]}} in %arg1 : (!transform.any_op) -> !transform.any_op %mul_1, %loop = transform.air.linalg_tile %mul [{elemwise_tiling_size_l1_m}, {elemwise_tiling_size_l1_n}] - transform.air.linalg_promote %mul_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} - transform.air.linalg_promote %mul_1 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} + transform.air.linalg_promote %mul_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op + transform.air.linalg_promote %mul_1 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op %add = transform.structured.match ops{{["linalg.add"]}} in %arg1 : (!transform.any_op) -> !transform.any_op %add_1, %add_loop = transform.air.linalg_tile %add [{elemwise_tiling_size_l1_m}, {elemwise_tiling_size_l1_n}] - transform.air.linalg_promote %add_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} - transform.air.linalg_promote %add_1 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} + transform.air.linalg_promote %add_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op + transform.air.linalg_promote %add_1 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op %matmul = transform.structured.match ops{{["linalg.matmul"]}} in %arg1 : 
(!transform.any_op) -> !transform.any_op %fill = transform.structured.match ops{{["linalg.fill"]}} in %arg1 : (!transform.any_op) -> !transform.any_op %matmul_1, %matmul_loop = transform.air.linalg_tile %matmul [{matmul_tiling_size_l1_m}, {matmul_tiling_size_l1_n}] - %fill_1 = transform.air.fuse_into_containing_op %fill into %matmul_loop - transform.air.linalg_promote %fill_1 {{"operands_to_promote"=[1], "memory_space"="L1"}} - transform.air.linalg_promote %matmul_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} + %fill_1 = transform.air.fuse_into_containing_op %fill into %matmul_loop : (!transform.any_op, !transform.any_op) -> !transform.any_op + transform.air.linalg_promote %fill_1 {{"operands_to_promote"=[1], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op + transform.air.linalg_promote %matmul_1 {{"operands_to_promote"=[2], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op %matmul_2, %reduction_loop = transform.air.linalg_tile %matmul_1 [0, 0, {matmul_tiling_size_l1_k}] - transform.air.linalg_promote %matmul_2 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} + transform.air.linalg_promote %matmul_2 {{"operands_to_promote"=[0,1], "memory_space"="L1"}} : (!transform.any_op) -> !transform.any_op transform.yield }} }} diff --git a/third_party/triton_shared b/third_party/triton_shared index c043a85..e0c5133 160000 --- a/third_party/triton_shared +++ b/third_party/triton_shared @@ -1 +1 @@ -Subproject commit c043a85a90167283ee910b99d506012c595b70ff +Subproject commit e0c513317d9e7838b00730fd494e3372400e93db diff --git a/third_party/triton_shared.patch b/third_party/triton_shared.patch index b4ea62f..1a78f37 100644 --- a/third_party/triton_shared.patch +++ b/third_party/triton_shared.patch @@ -70,10 +70,10 @@ index a4ac9ef..69f50b3 100644 def get_device_capability(self): return ("cpu", 0) diff --git a/lib/Analysis/PtrAnalysis.cpp b/lib/Analysis/PtrAnalysis.cpp -index 0fe5f84..b387759 100644 +index d2e8e77..35a9d88 100644 --- 
a/lib/Analysis/PtrAnalysis.cpp +++ b/lib/Analysis/PtrAnalysis.cpp -@@ -1051,7 +1051,7 @@ void PtrAnalysis::visitOperandUnrealizedCast( +@@ -977,7 +977,7 @@ void PtrAnalysis::visitOperandUnrealizedCast( struct ModuloChunkInitArg { Value reinterpretCast = nullptr; // where in the init args is the first chunk placed diff --git a/utils/mlir-air-hash.txt b/utils/mlir-air-hash.txt index bebca4b..54419df 100644 --- a/utils/mlir-air-hash.txt +++ b/utils/mlir-air-hash.txt @@ -1,3 +1,3 @@ -Commit: dfa6d08 -Timestamp: 2026050805 +Commit: 9377b0e +Timestamp: 2026060303 Version: 0.0.1 From 19a9f411813e521d2d74ac3a3e2161dca1f539ec Mon Sep 17 00:00:00 2001 From: erweiw Date: Tue, 2 Jun 2026 21:31:49 -0700 Subject: [PATCH 2/3] CI: switch mlir-aie find-links to latest-wheels-no-rtti-2 The mlir-air 9377b0e wheel pins mlir_aie_no_rtti (the renamed package published to the latest-wheels-no-rtti-2 release channel). The old latest-wheels-no-rtti channel only carries the legacy mlir_aie name and no longer receives new builds, so pip install fails with "No matching distribution" unless the latest-wheels-no-rtti-2 find-links URL is supplied. This commit updates env_setup.sh, env_setup.ps1, build.yml, nightly-wheels.yml, README.md, and pyproject.toml to point at latest-wheels-no-rtti-2. build.yml's pip show / install-dir extraction also switches to the new mlir_aie_no_rtti distribution name. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build.yml | 6 +++--- .github/workflows/nightly-wheels.yml | 4 ++-- README.md | 8 ++++---- pyproject.toml | 2 +- utils/env_setup.ps1 | 2 +- utils/env_setup.sh | 2 +- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7fd82b9..dbc0ee6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,15 +55,15 @@ jobs: echo "mlir-air timestamp: $MLIR_AIR_TIMESTAMP" python3 -m pip install "mlir_air[aie]==$MLIR_AIR_VERSION.$MLIR_AIR_TIMESTAMP+$SHORT_MLIR_AIR_COMMIT_HASH.no.rtti" \ -f https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti \ - -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ + -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly # The [aie] extra requires llvm-aie without a version pin. Force an # upgrade so we always test against the latest nightly wheel. 
python3 -m pip install --upgrade --force-reinstall llvm-aie -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly python3 -m pip show llvm-aie - python3 -m pip show mlir_aie + python3 -m pip show mlir_aie_no_rtti # Set environmental variable "MLIR_AIE_INSTALL_DIR" - MLIR_AIE_INSTALL_DIR_STR="$(python3 -m pip show mlir_aie | grep ^Location: | awk '{print $2}')/mlir_aie" + MLIR_AIE_INSTALL_DIR_STR="$(python3 -m pip show mlir_aie_no_rtti | grep ^Location: | awk '{print $2}')/mlir_aie" echo "MLIR_AIE_INSTALL_DIR=$MLIR_AIE_INSTALL_DIR_STR" >> $GITHUB_ENV # Update paths in environmental variables echo "${MLIR_AIE_INSTALL_DIR}/bin" >> $GITHUB_PATH diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml index 8172e32..6fbe6d5 100644 --- a/.github/workflows/nightly-wheels.yml +++ b/.github/workflows/nightly-wheels.yml @@ -345,7 +345,7 @@ jobs: ```bash pip install triton-xdna \ --find-links https://github.com/${{ github.repository }}/releases/expanded_assets/latest-wheels \ - --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ + --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ``` @@ -355,7 +355,7 @@ jobs: ```powershell pip install triton-xdna ` --find-links https://github.com/${{ github.repository }}/releases/expanded_assets/latest-wheels ` - --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti ` + --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 ` --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly ` --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ``` diff --git a/README.md b/README.md 
index 13498aa..5b42465 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ python3 -m pip install --upgrade pip # Install triton-xdna from GitHub Releases pip install triton-xdna \ --find-links https://github.com/amd/Triton-XDNA/releases/expanded_assets/latest-wheels \ - --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ + --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ``` @@ -62,7 +62,7 @@ pip install triton-xdna \ **Note:** To install from a local wheel file: ```bash pip install /path/to/triton_xdna-*.whl \ - --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ + --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ``` @@ -79,7 +79,7 @@ pip install cmake pybind11 nanobind wheel ninja pytest setuptools Cython # Install triton-xdna from source and all dependencies automatically pip install . 
--no-build-isolation \ - --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ + --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ``` @@ -179,7 +179,7 @@ pip install torch --index-url https://download.pytorch.org/whl/cpu pip install triton-windows pip install "mlir_air[aie]" ` -f https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ` - -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti ` + -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 ` -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly ``` diff --git a/pyproject.toml b/pyproject.toml index b821571..4366200 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ classifiers = [ # Installation command: # pip install triton-xdna \ # --find-links https://github.com/amd/Triton-XDNA/releases/expanded_assets/latest-wheels \ -# --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti \ +# --find-links https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 \ # --find-links https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly \ # --find-links https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti diff --git a/utils/env_setup.ps1 b/utils/env_setup.ps1 index 42448b2..dd3d2b0 100644 --- a/utils/env_setup.ps1 +++ b/utils/env_setup.ps1 @@ -51,7 +51,7 @@ Write-Host "mlir-air timestamp: $MLIR_AIR_TIMESTAMP" python -m pip install "mlir_air[aie]==$MLIR_AIR_VERSION.$MLIR_AIR_TIMESTAMP+$SHORT_AIR_COMMIT.no.rtti" ` -f https://github.com/Xilinx/mlir-air/releases/expanded_assets/latest-air-wheels-no-rtti ` - -f 
https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti ` + -f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels-no-rtti-2 ` -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly # The [aie] extra requires llvm-aie without a version pin. To track the diff --git a/utils/env_setup.sh b/utils/env_setup.sh index 2f010bd..78819f0 100755 --- a/utils/env_setup.sh +++ b/utils/env_setup.sh @@ -18,7 +18,7 @@ MLIR_AIR_TIMESTAMP=$(awk -v kw="Timestamp:" '$0 ~ kw {for (i=1; i Date: Tue, 2 Jun 2026 23:05:23 -0700 Subject: [PATCH 3/3] triton_shared: silence MSVC C4715 in UseInfo::meetUseType The bumped triton_shared (e0c5133) has a switch in UseInfo::meetUseType that covers all UseType enum values and returns from each case. MSVC's flow analysis doesn't recognize this as exhaustive and emits C4715 "not all control paths return a value", which CMake's /WX (warning as error) on the Windows wheel build promotes to a fatal error. GCC doesn't care, so Linux wheels build fine. Append llvm_unreachable after the switch. Folded into third_party/triton_shared.patch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- third_party/triton_shared.patch | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/third_party/triton_shared.patch b/third_party/triton_shared.patch index 1a78f37..b02b58e 100644 --- a/third_party/triton_shared.patch +++ b/third_party/triton_shared.patch @@ -69,6 +69,18 @@ index a4ac9ef..69f50b3 100644 def get_device_capability(self): return ("cpu", 0) +diff --git a/include/triton-shared/Analysis/UseAnalysis.h b/include/triton-shared/Analysis/UseAnalysis.h +index d4e6675..98a8034 100644 +--- a/include/triton-shared/Analysis/UseAnalysis.h ++++ b/include/triton-shared/Analysis/UseAnalysis.h +@@ -49,6 +49,7 @@ struct UseInfo : public dataflow::AbstractSparseLattice { + case UseType::MixUse: + return ChangeResult::NoChange; + } ++ llvm_unreachable("unhandled UseType"); + } + + ChangeResult meet(const AbstractSparseLattice &other) override { diff --git a/lib/Analysis/PtrAnalysis.cpp b/lib/Analysis/PtrAnalysis.cpp index d2e8e77..35a9d88 100644 --- a/lib/Analysis/PtrAnalysis.cpp