Commit da2e93b

Authored by masahi, Jokeren, neildhar, peterbell10, and pawelszczerbuk
OSS main merge (triton-lang#17)
* [PROTON] Metadata profiling with periodic flushing (#9236)

* Fix infinite rewrite loop in latest LLVM (#9249)

* [Kernels] Enable high occupancy persistent matmul (#9248)

  Having independent work within a single SM allows the warp scheduler to hide some of the bubbles in our pipelines. This brings persistent bf16 x mxfp4 MoE from 2290 GBps to 2640 GBps on H200, a 15% improvement.

* [CONSAN] Optimize compilation time (#9240)

  Reduce compilation time with consan enabled by not emitting alias matrices when the buffers trivially don't alias. This should also have a positive impact on execution speed. We see a ~15% compilation-time reduction in tutorials/01-attention-forward.py; the impact may be even bigger in cases where smem and tmem buffers never alias.

* Explicitly convert numpy array into scalar because it is required by numpy 2.4.0 (#9172)

  # New contributor declaration
  - [x] I am not making a trivial change, such as fixing a typo in a comment.
  - [x] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how).
  - [x] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`.
  - Select one of the following.
    - [x] I have added tests.
      - `/test` for `lit` tests
      - `/unittest` for C++ tests
      - `/python/test` for end-to-end tests
    - [ ] This PR does not need a test because `FILL THIS IN`.
  - Select one of the following.
    - [x] I have not added any `lit` tests.
    - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.)

  Fixes #9171

  Signed-off-by: Gregory Shimansky <gregory.shimansky@intel.com>

* [feat] Add sentinel when async compile (#9251)

* [triton_kernels] Fix device descriptor allocator to keep a pool (#9259)

  When used in a CUDA graph, the per-stream tensor allocator could leave dangling pointers in the CUDA graph. Suppose that during graph capture an 8 KB tensor is allocated as Triton scratch memory and recorded in the CUDA graph. If the allocator later needs to resize and allocates 16 KB, the previous tensor is decref'd and gc'd by Python. When replaying the CUDA graph, the 8 KB tensor that was captured is now dangling. Fix this by keeping old allocations alive.

* [AMD] Add clampf lowering via v_med3 (#9256)

  `v_med3_*` selects the median of three inputs, or returns the minimum if any of them is NaN.

* [PROTON] Further reduce unnecessary locks (#9257)

* [AMD] Added i8xi8xi32 v3 to wmma database (#9267)

* [AMD][gfx1250] Added missing f64.16x16x4.f64 to wmma database (#9271)

  Added support for the f64 case on the GFX1250 arch. Note:
  - We are skipping `llvm.amdgcn.wmma.f32.32x16x128.f4` support because there is no fp4 builtin type in Triton.
  - `llvm.amdgcn.wmma.f32.16x16x128.f8f6f4` can be added to the WMMA database as well. This requires refactoring; currently it is hard-coded and bypasses the database.

* [ci] Pin pandas < 3.0 (#9273)

  We're seeing test failures in proton, and a pandas update looks like the cause. Let's pin the version for now.
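The `v_med3` clamp lowering above relies on the identity that, for `lo <= hi`, `clamp(x, lo, hi)` equals the median of `{x, lo, hi}`. A minimal sketch of that equivalence (plain Python, not the backend lowering; the hardware's NaN behavior described above is not modeled here):

```python
def med3(a, b, c):
    """Median of three values, mirroring what v_med3_* computes for non-NaN inputs."""
    return sorted([a, b, c])[1]

def clampf(x, lo, hi):
    """Reference clamp: min(max(x, lo), hi)."""
    return min(max(x, lo), hi)

# For lo <= hi the two agree on every input.
for x in [-2.0, 0.0, 0.5, 1.0, 3.7]:
    assert med3(x, 0.0, 1.0) == clampf(x, 0.0, 1.0)
```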
* [BACKEND] Add support for out of tree Triton Dialect Plugins (#8523)

* Fix dotCanBeProperlyAsync when wgmma is not yielded by loop (#9274)

* [Backend][AMD] Change membarFilter to take bufferIDs into account (#9265)

  When a local memory location is deallocated and later reused, a `local_load` from the old allocation can still be in flight when a subsequent `async_copy` writes into the reallocated memory, which creates a data hazard because the membarFilter suppressed the barrier. We need to insert a `ttg.barrier local` to make sure the local load has finished fetching the data. Add Allocation to membarFilter to do that: only filter out a barrier when we are sure that the local_load and the async op have the same bufferID. If they access different bufferIDs, we shouldn't filter the barrier.

* [Kernels] Don't flatten persistent hopper mixed precision matmul (#9279)

  For some reason, overlapping the epilogue with the prologue of the next tile is actually slower here. Disabling it gives a 150 GBps speedup on H200, from 2650 GBps to 2800 GBps. My hypothesis is that because we use occupancy here, it allows the other block in the SM to better utilise the tensor cores while we process the epilogue. Similar to a ping-pong schedule, but done by the warp scheduler.

* [AMD][NFC] Used IsFPClass MLIR op for checkIsNan (#9276)

  Removes an explicit intrinsic call and replaces it with a dedicated MLIR op.

* Use variadic argument pre-compiled cuda launcher (#6788)

  * Create a new variadic launcher in driver.c.
  * Remove the C string / string substitution logic that creates a cuda kernel from driver.py.
  * Add logic to parse arguments and remove constexprs / flatten tuples in the new launcher.

  The launch overhead measured using the scripts in the comments below shows no regressions.

* [NVIDIA,Membar] Resurrect NVIDIA::canSkipBarSync (#9246)

  I was debugging Membar and was confused why `triton-opt --test-print-membar` gives different results than the normal compiler pass. It turns out this `canSkipBarSync` filter is enabled in testing mode but NOT in real compilation. It was removed during Blackwell enablement, presumably because it makes assumptions about the kernel structure. This scales it back to something more conservative and re-enables it in normal compilation. With this I see an 18 GBps improvement in persistent bf16 x mxfp4 MoE.

* [Docs] Clarify divisibility reset logic for contiguous dimensions in AxisInfo (#9266)

  Improves the code documentation in `AxisInfo.cpp`, specifically within the `MulIOp` visitor. The logic `lhsDivisibility = 1` when `contiguity > 1` is a critical safety guardrail; without explicit explanation it looks like an arbitrary pessimization. This change clarifies that:
  1. Contiguous sequences (e.g., `range(0, N)`) inherently have a GCD of 1 across the dimension.
  2. Propagating the base value's divisibility would lead to incorrect alignment assumptions in downstream passes (like `Coalesce` or `LoadStoreOp` vectorization), potentially causing illegal memory accesses or miscompiled code.

  This documentation helps future contributors understand the interaction between `Contiguity` and `Divisibility` without needing to derive the number theory from scratch.

* Fix infinite loop in dotCanBeProperlyAsync (#9282)

  The `checkOperand` traversal can run forever if either:
  1. A block argument participates in a cycle containing only permitted instructions.
  2. A block argument is defined outside of `forOp`, in which case we never advance transitiveOperand.

  To fix (1), track the set of visited block arguments. If we visit the same block argument again, we are in a cycle originating in the init value of the iter arg, which is outside the loop. To fix (2), check for values defined outside the loop as we iterate. This way, we know that if we are evaluating a block argument, it must be an iter arg of the loop.
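The visited-set fix for case (1) above can be sketched in isolation. This is a hypothetical model, not Triton's actual MLIR traversal: `producer` stands in for "the transitive operand that defines this value inside the loop".

```python
def traces_inside_loop(value, producer):
    """Walk a chain of defining values. `producer` maps a value to the operand
    it forwards. Returns False if the walk revisits a value (a cycle through a
    loop-carried iter arg, i.e. the chain originates outside the loop), and
    True once it reaches a value with no in-loop producer."""
    visited = set()
    v = value
    while v in producer:
        if v in visited:
            return False  # revisited: cycle through an iter arg
        visited.add(v)
        v = producer[v]
    return True

# A two-value cycle terminates instead of looping forever:
assert traces_inside_loop("a", {"a": "b", "b": "a"}) is False
assert traces_inside_loop("a", {"a": "b"}) is True
```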
* [triton_kernels] Fix test case for distributed routing kernels (#9258)

* [AMD][gfx1250] Add Assumptions and Fix Predicate in MXGEMM Kernel (#9285)

  This PR:
  - adds assumptions on the loop boundary
  - fixes a predicate to eliminate readfirstlane instrs
  - moves `static_profile` and `composition` to a shared location

* [AMD] Don't use s_waitcnt to lower global barrier for now (#9287)

  Atomics tests are failing a small percentage of the time because of an invalid transformation in the backend. This will be reverted once the backend is fixed and LLVM is bumped.

* Adding Triton community meetup notes for 20260106 (#9288)

  Topics and discussions covered:
  - Update on triton-shared (Haishan Zhu and Nhat Nguyen, Meta)
  - Update on the plugin system infrastructure: what's upstream today and the roadmap (Puyan Lotfi and Corbin Robeck, Meta)
  - Status of the Triton Plugin repository and an example: loop unfolding (Simon Waters, kernelize.ai)

* [AMD][gfx1250] Enable some TDM features (#9283)

  This pull request adds basic threading for supporting descriptor load/store. More changes are coming. For now this allows running:

  `python3 -m pytest -s test/unit/language/test_tensor_descriptor.py::test_make_tensor_descriptor_matmul[128-128-16-1-1]`

* [Consan] Reduce number of configs in coverage tests (#9294)

* Revert "[AMD][gfx1250] Added missing f64.16x16x4.f64 to wmma database" (#9291)

  This reverts https://github.com/triton-lang/triton/commit/53b0eafd76debe074965a5d751dd21c593097eb2.

* [AMD] Add PrepareIfCombining pass to enable scf.if combining (#9253)

  Add a new pass that moves operations out from between scf.if pairs that share the same condition. This enables the canonicalizer to combine adjacent if operations. Also remove the moveUpTranspose optimization from the ReorderInstructions pass, as we no longer need it with this new pass.

* [Backend] Separate out additive kReg smem padding contribution calculation (#9286)

  LLVM was struggling to use immediate values in the padded offset calculation. This change makes things easier for LLVM, and we get a nice reduction in register usage. For non-padded pathways, there should be no difference in the LLVM IR generated.

* [ANALYSIS] Enhance divisibility handling in AxisInfo for addition and subtraction operations (#9297)

  Unfortunately it's probably difficult to derive a very generic rule for all ops in Triton, though these `affine` ops share many common characteristics.

* [Nvidia] Enable TMA im2col mode -- Tensor Descriptor (#9225)

  This is the second PR in a series that enables TMA im2col mode (in addition to the existing tiled mode) for NVIDIA GPUs. The goal of the series is to support TMA im2col mode in the Gluon DSL.
  - First PR: https://github.com/triton-lang/triton/pull/9202
  - -> Second PR: https://github.com/triton-lang/triton/pull/9225

  PTX ISA documentation for TMA im2col mode: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode
  TMA tensor descriptor documentation: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

  Summary of changes:
  - Add a general `TT_TensorDescInterface`
  - Add `TTNG_TensorDescIm2ColType` for the Nvidia TMA im2col mode
  - Add a lit test

  Note: the additional im2col parameters `elementStrides`, `pixelBoxLowerCorner`, `pixelBoxUpperCorner`, `channelsPerPixel`, and `pixelsPerColumn` are passed to the tensor descriptor from the host at runtime, so they do not need to be in the kernel IR. In im2col mode, `blockType` must be rank-2: [`channelsPerPixel` x `pixelsPerColumn`].
  Co-authored-by: Thomas Raoux <thomas.raoux@openai.com>

* [AMD][gluon][gfx1250] Support tensor async scatter (#9299)

  Supports tensor async scatter using TDM. Gluon only at the moment.

* [GLUON] Add get_view() to Gluon layouts (#9270)

  Adds a new method `get_view(shape, use_hw_view)` to all Gluon layout classes. It can be called from outside a kernel context, which is useful in development for inspecting and debugging layouts. Exports a new `gluon_ir.get_layout_view` function that creates a temporary MLIR context and converts to a LinearLayout. It returns the same output as `triton-tensor-layout`, but without needing to convert to TTG IR. Tested by asserting the output is the same as the CLI tool's.

  Example:

  ```python
  layout = gl.BlockedLayout([2, 1], [1, 4], [1, 2], [1, 0])
  print(layout.get_view([2, 8]))

  [[T0:0, T1:0, T2:0, T3:0, T4:0, T5:0, T6:0, T7:0]
   [T0:1, T1:1, T2:1, T3:1, T4:1, T5:1, T6:1, T7:1]]

  layout = gl.NVMMASharedLayout.get_default_for([16, 16], gl.float16)
  print(layout.get_view([16, 16]))

  [[( 0: 0),( 0: 1),( 0: 2),( 0: 3),( 0: 4),( 0: 5),( 0: 6),( 0: 7),( 0: 8),( 0: 9),( 0:10),( 0:11),( 0:12),( 0:13),( 0:14),( 0:15)]
   [( 1: 0),( 1: 1),( 1: 2),( 1: 3),( 1: 4),( 1: 5),( 1: 6),( 1: 7),( 1: 8),( 1: 9),( 1:10),( 1:11),( 1:12),( 1:13),( 1:14),( 1:15)]
   [( 2: 0),( 2: 1),( 2: 2),( 2: 3),( 2: 4),( 2: 5),( 2: 6),( 2: 7),( 2: 8),( 2: 9),( 2:10),( 2:11),( 2:12),( 2:13),( 2:14),( 2:15)]
   [( 3: 0),( 3: 1),( 3: 2),( 3: 3),( 3: 4),( 3: 5),( 3: 6),( 3: 7),( 3: 8),( 3: 9),( 3:10),( 3:11),( 3:12),( 3:13),( 3:14),( 3:15)]
   [( 4: 8),( 4: 9),( 4:10),( 4:11),( 4:12),( 4:13),( 4:14),( 4:15),( 4: 0),( 4: 1),( 4: 2),( 4: 3),( 4: 4),( 4: 5),( 4: 6),( 4: 7)]
   [( 5: 8),( 5: 9),( 5:10),( 5:11),( 5:12),( 5:13),( 5:14),( 5:15),( 5: 0),( 5: 1),( 5: 2),( 5: 3),( 5: 4),( 5: 5),( 5: 6),( 5: 7)]
   [( 6: 8),( 6: 9),( 6:10),( 6:11),( 6:12),( 6:13),( 6:14),( 6:15),( 6: 0),( 6: 1),( 6: 2),( 6: 3),( 6: 4),( 6: 5),( 6: 6),( 6: 7)]
   [( 7: 8),( 7: 9),( 7:10),( 7:11),( 7:12),( 7:13),( 7:14),( 7:15),( 7: 0),( 7: 1),( 7: 2),( 7: 3),( 7: 4),( 7: 5),( 7: 6),( 7: 7)]
   [( 8: 0),( 8: 1),( 8: 2),( 8: 3),( 8: 4),( 8: 5),( 8: 6),( 8: 7),( 8: 8),( 8: 9),( 8:10),( 8:11),( 8:12),( 8:13),( 8:14),( 8:15)]
   [( 9: 0),( 9: 1),( 9: 2),( 9: 3),( 9: 4),( 9: 5),( 9: 6),( 9: 7),( 9: 8),( 9: 9),( 9:10),( 9:11),( 9:12),( 9:13),( 9:14),( 9:15)]
   [(10: 0),(10: 1),(10: 2),(10: 3),(10: 4),(10: 5),(10: 6),(10: 7),(10: 8),(10: 9),(10:10),(10:11),(10:12),(10:13),(10:14),(10:15)]
   [(11: 0),(11: 1),(11: 2),(11: 3),(11: 4),(11: 5),(11: 6),(11: 7),(11: 8),(11: 9),(11:10),(11:11),(11:12),(11:13),(11:14),(11:15)]
   [(12: 8),(12: 9),(12:10),(12:11),(12:12),(12:13),(12:14),(12:15),(12: 0),(12: 1),(12: 2),(12: 3),(12: 4),(12: 5),(12: 6),(12: 7)]
   [(13: 8),(13: 9),(13:10),(13:11),(13:12),(13:13),(13:14),(13:15),(13: 0),(13: 1),(13: 2),(13: 3),(13: 4),(13: 5),(13: 6),(13: 7)]
   [(14: 8),(14: 9),(14:10),(14:11),(14:12),(14:13),(14:14),(14:15),(14: 0),(14: 1),(14: 2),(14: 3),(14: 4),(14: 5),(14: 6),(14: 7)]
   [(15: 8),(15: 9),(15:10),(15:11),(15:12),(15:13),(15:14),(15:15),(15: 0),(15: 1),(15: 2),(15: 3),(15: 4),(15: 5),(15: 6),(15: 7)]]
  ```

* [Nvidia] Enable TMA im2col mode - Fix tma load op (#9303)

  This is the third PR in the TMA im2col series.
  - First PR: https://github.com/triton-lang/triton/pull/9202
  - Second PR: https://github.com/triton-lang/triton/pull/9225
  - -> Third PR: https://github.com/triton-lang/triton/pull/9303

  This PR fixes the AsyncTMACopyGlobalToLocalOp:
  - Delete TTNG_TensorModeAttr, since we can infer the descriptor mode from the desc type
  - Remove TTNG_TensorModeAttr from TTNG_AsyncTMACopyGlobalToLocalOp
  - Fix the op verification and lit test accordingly

* [BACKEND][AMD] Add option to swap MIR (#8711)

  We use TRITON_SWAP_MIR to specify the path to an externally provided MIR file that is used for compilation to ASM in the LLVM backend, overriding the one generated by the regular pipeline. This can be used to apply custom transformations on the MIR and use it to generate the binary.

* [BACKEND][AMD] Use addOccurrence for LLVM options to properly disable schedulers (#9311)

  Use addOccurrence instead of setValue when setting LLVM command-line options like enable-misched=false. This is necessary because LLVM's scheduler passes check getNumOccurrences() to determine whether the option was explicitly set on the command line (see llvm/lib/CodeGen/MachineScheduler.cpp). Also add a test to verify that the MIR swap pipeline starts before machine-scheduler and that both machine-scheduler and the post-RA scheduler are disabled.
* [PROTON][BLACKWELL] Enable low-overhead hardware trace (#9307)

* Increment index in plugin example (#9315)

  In the Triton Plugin example, the plugin pass enumerator does not increment the index into `passNames`. If this example were used elsewhere with multiple passes, this would overwrite the first returned pass name. It's a minor tweak and this is just an example, but it seems best to fix this before it gets heavy use.

* [triton] Add `tl.cat(can_reorder=False)` implementation (#9312)

  This resurrects the old PR that replaced the implementation entirely. It also fixes `tl.cat` to be equivalent in semantics to `torch.cat`.

* [BACKEND][AMD] Add ScopedNoAliasAAWrapperPass to MIR swap pipeline (#9309)

* Fix tolerance for float8 x mx combinations in matmul tests (#9316)

  Increases the tolerance from 3e-2 to 2e-1 for float8 x mx combinations to account for compounded quantization errors.

* [triton_kernels] Unfuse fma in reduce kernel for numeric stability (#9320)

* [Nvidia] Enable TMA im2col mode - driver support (#9305)

  This is the fourth PR in the TMA im2col series.
  - First PR: https://github.com/triton-lang/triton/pull/9202
  - Second PR: https://github.com/triton-lang/triton/pull/9225
  - Third PR: https://github.com/triton-lang/triton/pull/9303
  - -> Fourth PR: https://github.com/triton-lang/triton/pull/9305

  This PR adds the driver function for creating the tensor descriptor for TMA im2col mode. The driver function is documented at https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html
* [PROTON] Fix default value for TRITON_ENABLE_HW_TRACE in CuptiProfiler (#9324)

* [AMD][gfx1250] Update f16 GEMM examples (#9326)

  * Factored out common utilities
  * Added support for L2 prefetch

  Co-authored-by: Alexander Weinrauch <Alexander.Weinrauch@amd.com>
  Co-authored-by: Jungwook Park <jungwook.park@amd.com>
  Co-authored-by: Xiaohu Guo <Xiaohu.Guo@amd.com>

* [Backend] Bump LLVM to 2eb709b95d8f (#9264)

  Bumps LLVM to `2eb709b95d8f` to consume two fixes related to `gfx1250`:
  - https://github.com/llvm/llvm-project/pull/176206
  - https://github.com/llvm/llvm-project/pull/176355

  Co-authored-by: neildhar <neildhar@users.noreply.github.com>
  Co-authored-by: Lei Zhang <antiagainst@gmail.com>

* [triton_kernels] Add unpadded batch size handling to reduce (#9332)

* [AMD] Use v_perm instruction for convert_layout acceleration (#9014)

  This PR introduces an AMD-specific ttg->llvm pattern that uses v_perm instructions instead of combinations of shifts and logical operations. Limitations of this pattern:
  - Applied only to 8-bit data types
  - The conversion is required to be bijective
  - No permutation across threads in the workgroup

  Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>

* [Kernels] Re-enable 4-warp persistent kernel (#9331)

  It looks like some of the interceding changes have removed the code sequence that was causing issues with ptxas. I also had to disable `warp_specialize`, as I'm getting an invalid IR error from inside the pass.

* Partial revert of "[AMD][NFC] cleanup llvm instrinsic calls; replaced with rocdl ops" (#9334)

  `getClusterCTAId` should return the local CTA id in the cluster. https://github.com/triton-lang/triton/pull/9222 replaced it with the overall cluster id, which is wrong. I could not find a ROCDL op exposing the workgroup id inside the cluster, so this PR reverts it back to the intrinsic.

* Revert "Fix tolerance for float8 x mx combinations in matmul tests (#9316)" (#9338)

  This reverts commit bcbcabdd0cff6539c7168299075992b2a23ff38e.
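The `v_perm` acceleration above works because a byte-level layout conversion is a byte permutation, which `v_perm_b32` performs in a single instruction. The following is an illustrative software model of that instruction, not the backend code: each selector byte picks one byte from the 8-byte concatenation of the two source dwords (selector values of 8 and above have special hardware meanings that are not modeled here).

```python
def v_perm(src0: int, src1: int, sel: int) -> int:
    """Model of v_perm_b32: bytes 0-3 of the pool come from src1, bytes 4-7
    from src0; selector byte i (little-endian) chooses result byte i."""
    pool = src1.to_bytes(4, "little") + src0.to_bytes(4, "little")
    # & 0x7 keeps the index in range, ignoring the special selector encodings.
    out = bytes(pool[(sel >> (8 * i)) & 0x7] for i in range(4))
    return int.from_bytes(out, "little")

# Byte-reverse src1 with a single "instruction":
assert v_perm(0xAABBCCDD, 0x11223344, 0x00010203) == 0x44332211
```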
* [AMD] NFC: Commonise some checks into functions (#9335)

  This makes feature checks a bit tidier.

* [FRONTEND] Add option to round f32 to tf32 in descriptor (#9295)

  Adds an option to round float32 inputs to tf32 values on the fly.

* [AMD][Gluon] Warp-pipeline fixes and priority hint (#9301)

  These changes sync internal warp-pipeline fixes into upstream to align correctness (warp size), configurability (priority hints), and safety (dependency analysis / barrier placement) across the Gluon and LLVM lowering paths.
  - Correct warp size handling in the warp-pipeline for the given AMDGPU HW arch.
  - Introduce optional priority hints on warp-pipeline stages, enabling more explicit scheduling intent for stage ordering/perf tuning.
  - Fix a bug in warp-pipeline dependency analysis.

* [AMD][gfx1250] Skip dot tests for bf16x3 and bf16x6 types (#9343)

* Apply rounding to TF32 in MoE (#9296)

* [AMD][BACKEND] Fix TDM shape adjustment to factor in the CGA offset (#9341)

  TDM needs to adjust the shape in the tensor descriptor to account for the tile offset when computing OOB. However, we did not take the CGA offset into account. I adjusted the test to overallocate the buffers so the test can catch out-of-bounds writes and we do not rely on a random segfault to detect the OOB write.

* [AMD][gluon][gfx1250] Add tensor async gather support using TDM (#9313)

  Implements tensor async_gather using TDM, in a similar fashion to https://github.com/triton-lang/triton/pull/9299, on Gluon.

* [AMD] Replace ReorderInstructions with MoveUpPrologueLoads (#9328)

  After the recent changes, the ReorderInstructions pass had only one optimization left: moving prologue loads early for prefetching. Add a new pass for that optimization, refactor the implementation, and add more tests.
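The f32-to-tf32 rounding mentioned above can be illustrated at the bit level. TF32 keeps the f32 sign and 8-bit exponent but only the top 10 mantissa bits; the sketch below uses plain truncation and is not Triton's actual conversion (which may use a different rounding mode):

```python
import struct

def f32_to_tf32(x: float) -> float:
    """Truncate an f32 value to tf32 precision by zeroing the low 13 mantissa bits."""
    bits = struct.unpack("<I", struct.pack("<f", x))[0]
    bits &= 0xFFFFE000  # keep sign, 8-bit exponent, and top 10 mantissa bits
    return struct.unpack("<f", struct.pack("<I", bits))[0]

assert f32_to_tf32(1.0) == 1.0  # exactly representable values pass through
```

Truncation is idempotent, so applying the conversion twice gives the same value as applying it once.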
* [AMD] Support TDM AsyncWait in UpdateAsyncWaitCount (#9352) Since TDM scatter and gather might produce more than 1 intrinsic/assembly instruction we need to compute the correct waitcnt based on the #intrinsics instead of the number of TGGIR ops. This piggy-backs on the analysis pass we already use for `ttg.async_copy_global_to_local`. This allows us to use the number of TTGIR TDM ops in Gluon and also works with the token based approach from the pipeliner. * [AMD][BACKEND] Support multi-CTA for `AMDWmmaEncodingAttr` (#9340) Adjusts the verifier of DotOperandEncoding to allow for different CGALayouts between the accumulator and the operands. The dot verifier ensure that the accumulator does not broadcast and A is broadcasting along dim1 and B along dim0. If this turns out to be too restrictive, e.g. if the case comes up where we want to do duplicated work on CTAs, we can relax this later. * [Gluon] Refactor multi-CTA support of PaddedSharedLayouts (#9336) Refactors PaddedSharedLayouts to make them work nicer in multi-CTA. - Instead of `block_bases` it now holds the `CGALayout` in Gluon. - `with_identity_for` has to use the `shape_per_cta` to properly build the identity `linear_component` - `get_padded_shared_layout` should multiply the CTA and CGA layout - Adjust the verifier to broadcast in the `block` dimension. We still do not support broadcasts in the `offset` dimension * [AMD] Introduce PartitionedSharedEncodingAttr (#9314) This PR introduces PartitionSharedEncodingAttr. An encoding for tensors whose elements are partitioned across multiple separate shared memory allocations. This reduces shared memory partition conflicts by splitting a tensor along a specific dimension into separate allocations. Parameters: - numPartitions: Number of distinct memory partitions (and separate buffers). Buffers in different partitions MUST be placed in different physical shared memory slots. - numGroups: Number of groups. 
Each group contains numPartitions consecutive pieces of the tensor. - partitionDim: The dimension along which the tensor is partitioned. - partitionLayout: The shared memory layout used within each piece. The total number of logical pieces is numPartitions * numGroups. Pieces are organized as: [Group 0: pieces 0..numPartitions-1] [Group 1: pieces numPartitions..2*numPartitions-1] ... ## Memory Allocation The allocator creates numPartitions buffers (NOT numLogicalPieces buffers). Each buffer contains all pieces from all groups that belong to that partition, concatenated together. Buffer size = pieceSize * numGroups. For example, with numPartitions=2, numGroups=4 and partitionDim=0 on a [128, 32] tensor: - Total 8 logical pieces, each of size [16, 32] (pieceSize = 16*32 elements) - Piece layout: [0, 1, 2, 3, 4, 5, 6, 7] - Partitions: 0 1 0 1 0 1 0 1 - Groups: |Grp0||Grp1||Grp2||Grp3| Physical allocation (2 buffers, each containing 4 pieces): - Buffer 0 (Partition 0): [Piece0 | Piece2 | Piece4 | Piece6] - Buffer 1 (Partition 1): [Piece1 | Piece3 | Piece5 | Piece7] Physical allocation guarantee: Buffers in different partitions MUST reside in distinct physical shared memory partitions. TODO: Implement lowering of operations with PartitionSharedEncodingAttr. --------- Co-authored-by: Ognjen Plavsic <plognjen@amd.com> * [AMD][GLUON] Add multi-CTA GEMM example for `gfx1250` (#9342) Adds a multi-CTA GEMM example for `gfx1250` where we multicast A and B sub-tiles which are shared by CTAs of the cluster. `ctas_per_cga` is uses to control the cluster size and how we partition the accumulator across CTAs. 
This PR requires the following PRs to pass its correctness tests:
- https://github.com/triton-lang/triton/pull/9341
- https://github.com/triton-lang/triton/pull/9340
- https://github.com/triton-lang/triton/pull/9336
- https://github.com/triton-lang/triton/pull/9334

* [AMD] Support 8-Warp Pingpong and Refactor MXGEMM Kernel on GFX1250 (#9356) This PR:
- Refactored the MXGEMM kernel to support various schedules
- Supported 8-warp scheduling and 8-warp pingpong scheduling

---------

Co-authored-by: Lei Zhang <antiagainst@gmail.com>

* [TRITON_KERNELS] some more tweaks (#9350)

* [AMD][GLUON] Support AutoLayout for `offsets` and `mask` in `buffer_store` (#9353) `buffer_store` can infer the layout of `mask` and `offsets` based on the layout of `stored_value`. Currently we only infer the `mask` layout from `offsets`, which is unnecessarily restrictive.

* [docs] Add topk operation to language documentation (#9345)

## Summary
- Add `topk` to the Scan/Sort Ops section in `triton.language.rst`
- Add a docstring to the `topk` function describing parameters, return value, and a usage example

## Motivation
Fixes #9278 The `topk` operation is implemented and exported in `triton.language` but was not mentioned in the API documentation at https://triton-lang.org/main/python-api/triton.language.html

## Changes
1. **docs/python-api/triton.language.rst**: Added a `topk` entry under the Scan/Sort Ops section
2. **python/triton/language/standard.py**: Added a comprehensive docstring to the `topk` function including:
   - Description of functionality
   - Parameter documentation
   - Return type
   - Usage example

## Testing
Documentation builds correctly with the new entry.

* [AMD][BACKEND] Support padding in TDM store if interval equals the inner dimension (#9360) TDM store does not support padding. However, we can adjust the tile dim to include padding and shrink the tensor shape to drop stores for padding bytes. This only works for a single padding interval that equals the inner dimension. Other padding configs are not supported.
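The `topk` operation documented above can be illustrated with a plain-Python reference. This is only a sketch of top-k semantics for the docs entry, not Triton's parallel implementation, and the exact `tl.topk` signature may differ:

```python
# Reference semantics for a top-k operation, as a plain-Python sketch.
# Triton's real topk runs as a parallel kernel over tensors; this only
# illustrates what the result should contain.
def topk_ref(values, k, descending=True):
    """Return the k largest (or smallest) elements, in sorted order."""
    return sorted(values, reverse=descending)[:k]

print(topk_ref([3, 1, 4, 1, 5, 9, 2, 6], k=3))  # -> [9, 6, 5]
```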
* [AMD][gfx1250] Add clamp operand to WMMA int intrinsic (#9359)

* reland upgrade ptxas for blackwell (#9363)

* [AMD][gfx1250] Roll up some small changes (#9365)
- Register gfx1250 to various target capability queries
- Tighten some error checking logic
- Adjust tests for gfx1250

* Revert "[AMD] Introduce PartitionedSharedEncodingAttr" (#9367) Reverts triton-lang/triton#9314, which caused a functional regression in the fence insertion pass.

* [PROTON] Vector metrics (#9329)

* Fix static srcMap bug causing multi-arch FP8 compilation failures (#9364) Compiling FP8 kernels for multiple GPU architectures in the same process fails with "failed to legalize operation 'tt.fp_to_fp'". The bug is in ElementwiseOpToLLVM.cpp's `FpToFpOpConversion::getConversionFunc()`. The srcMap that maps (srcType, dstType, roundingMode) to converter functions was declared `static`, causing it to be initialized once with the first architecture's isaFamily. Subsequent architectures would reuse the wrong converters. Example failure scenario:
1. Compile an FP8 kernel for gfx950 (CDNA4) first - the static srcMap is initialized with CDNA4 converters (e.g., Fp8E4M3FNUZ_to_Fp16)
2. Compile the same kernel for gfx942 (CDNA3) - the srcMap is already initialized and returns CDNA4 converters - CDNA3 lacks the CDNA4-specific instructions, so compilation fails

The performance impact of removing `static` is negligible, since the map has only ~25 entries and is built per-conversion (not per-instruction).

* [TRITON_KERNELS] added global scale to reduction (#9372)

* [AMD][gfx1250][gluon] TDM: Fix OOB handling for Scatter/Gather (#9371) Fixes the incorrect offset handling when the tile size `N` is not a multiple of `BLOCK_N`, causing the last block of a TDM Scatter/Gather to be a partial block. This happened because, unlike in `fillTDMDescriptor` for standard TDM, the tensor shapes weren't updated based on offsets within `fillTDMDescriptorForGatherScatter` for gather/scatter.

* [AMD][GLUON] Add 4 and 8 warps stream-k gluon kernels for gfx1250 (#9370)
- add an 8-warp pingpong stream-k kernel implementation
- persistent loop prefetch to enable overlap between prologue and epilogue
- implement 4-warp stream-k specific partial tile processing with atomic-based spinning locks

* Consider rematerialisation cost when hoisting over ext (#9194) Avoid hoisting converts if the cost of rematerialising the slice and the new convert is higher than the original convert. To do this, factor out the cost computation from `backwardRematerialization` into `isRematBeneficial` and call it from `hoistConvertOnTopOfExtOrBroadcast`. The added test demonstrates a simple example where the hoisted convert is more expensive than the original.
It is also possible to extend this to the other hoisting functions, although it requires slightly more care (e.g. a convert hoisted into a conditional should probably carry less cost).

* Fix assert in `_p_matmul` (#9376) The current check prevents calling the persistent matmul on MXFP8, though per the comment it is intended to block MXFP4 weights only.

* [Backend] Bump to llvm/llvm-project@ac5dc54d5091 (#9333)

* Reapply "[AMD] Introduce PartitionedSharedEncodingAttr" (#9367) (#9374) The initial patch was reverted because ProxyFenceInsertion.cpp still used getBufferIds instead of getAllBufferIdsWithAliases. Previously, getBufferIds returned buffer IDs for both a value and its aliases. With partitioned tensor support, a single value can now hold multiple buffers, so the API was split: getBufferIds returns only the buffer IDs for a value, while getAllBufferIdsWithAliases includes aliased buffer IDs as well.

Co-authored-by: Ognjen Plavsic <plognjen@amd.com>

* [AMD] Support PaddedSharedLayout in TDM Gather on GFX1250 (#9369) This PR supports `PaddedSharedLayout` in TDM Gather in limited cases.
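The rematerialisation-cost check from #9194 above boils down to a comparison: hoist only if rematerialising the slice plus the new convert is no more expensive than the original convert. A minimal sketch, with all names and cost units purely illustrative (this is not Triton's actual `isRematBeneficial` API):

```python
# Hypothetical sketch of the cost check described above. Costs are abstract
# instruction-count estimates; the real analysis walks the IR slice.
def is_remat_beneficial(slice_cost, new_convert_cost, original_convert_cost):
    # Hoisting pays off only if duplicating the slice and emitting the new
    # convert is not more expensive than keeping the original convert.
    return slice_cost + new_convert_cost <= original_convert_cost

# A hoist that must rematerialise an expensive slice is rejected:
print(is_remat_beneficial(slice_cost=10, new_convert_cost=4,
                          original_convert_cost=8))  # -> False
```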
* [AMD][gfx1250] test_dot fix for small K (#9358)

* [ConSan] First pass at improving ConSan compile times (#9366) When compiling `01-attention-forward.py` with consan enabled, before:
```
# times.ir_initialization=790100     0.79 sec
# stage='ttgir' duration=182893      0.18 sec
# stage='llir' duration=99643940     99.6 sec
# stage='llvmir' duration=27658211   27.7 sec
# stage='ptx' duration=11120513      11.1 sec
# stage='cubin' duration=1149355658  19.15 min
```
After:
```
# times.ir_initialization=796533     0.79 sec
# stage='ttgir' duration=184720      0.18 sec
# stage='llir' duration=16957192     17.0 sec
# stage='llvmir' duration=3735579    3.74 sec
# stage='ptx' duration=1972309       1.97 sec
# stage='cubin' duration=34357675    34.36 sec
```
This PR does quite a number of things at once:
* Custom CanonicalizeLLVMIR pass that adds a pattern for `select %false|%true, %a, %b`, since the LLVM dialect is missing this (and is opposed to adding it)
* Cache global constants to avoid creating many copies of the same string when lowering asserts
* Fix warp specialize lowering to handle function calls and deduplicate barrier lowering code between the NVIDIA and AMD backends. To support function calls, non-kernel functions are rewritten to accept a barrier handle argument that is passed down from the call site
* Rewrite `createMultiColumnMask` to generate a constant tensor rather than computing it from a bunch of `make_range` and masking. This single function was generating gigabytes of IR
* Pick warp-local layouts in consan instrumentation. Previously, consan used thread-local layouts where every thread has a copy of the tensor. This was to avoid using shared memory. We can switch to warp-local layouts, where each warp has a copy of the tensor distributed across its threads, to reduce the generated IR (and register usage) by a factor of 32, plus some extra IR needed for shuffles.
* To support warp-local layouts, I added a `uniform` flag to `tt.assert` and replaced `tti.experiment_assert_in_thread` with a `tt.reduce` + `tt.assert uniform`. Uniform just means only the first thread in the warp group will trigger the assert, since the condition is uniform
* I added an `always_use_warp_shuffle` function-level flag to force `convert_layout` lowering to use warp shuffles even when the performance heuristic picks shared memory, to avoid using shared memory for the layout conversions inside consan helpers
* Changed the lowering of `arith.constant` with a non-splat dense elements attribute to generate a constant global array from which each thread loads
* Outlined the global stores in the main function into a helper function to reduce bloat by deduplicating. This also enables separate compilation later.

* [AMD][BACKEND] Fix shared layout order for AsyncCopy on GFX9 (#9373) The heuristic used `getElementsPerThread` instead of `getContigPerThread` to retrieve the contiguity. It also has to clamp the `vecSize` to a width supported by the hardware to see if we need to carry over the `regOrder`. Before this PR, for the added lit tests we selected an order for the shared layout which does not allow for coalesced direct-to-lds writes.

* [AMD][GFX1250] Roll up some minor changes (#9380)
* Add pattern to optimize tanh
* Fix asserts for WMMA
* Fix some TDM field updates

* [AMD][BACKEND] Fix RangeAnalysis tripCount calculation (#9383) We currently check whether the `optional<bool>` holds a value instead of checking the actual held value to decide if we take the upper or the lower limit. If the optional is empty we take the minimum to preserve the behavior from before this PR. Note that the first adjusted lit test only tests whether we fold an `scf.ifOp` after the `scf.forLoop`, so the changes in the loop are not removing the actual test.

* [AMD][GLUON] Add CGA Layout to wmma scale (#9381) Attach CGA Layout for scale's linear layout.
This is needed when we use wmma scaled in multi-cta kernels.

* [AMD][gluon] Add gfx1250 warp-pipeline f16 GEMM example (#9382) Provide a warp-pipeline TDM GEMM kernel with shared-memory buffering.

* add a second gb200 node to CI (#9384)

* [BACKEND] Improve and simplify ReduceOp's lowering (#9219) Stacked PRs:
- #9327
- #9318
- #9317
- #9221
- #9220
- -> #9219

### [BACKEND] Improve and simplify ReduceOp's lowering

We implement a LinearLayout-based `ReduceOp` lowering. This has a number of benefits:
- The logic is noticeably simpler, as we barely have to implement anything. ConvertLayout and some LL helpers do all the heavy lifting
- We get shmem swizzling for free
- We sometimes save a shmem round-trip (before, we did it unconditionally)
- It is now clear that we have a `tmpLl` variable we can carefully choose (we'll do so in a future PR)
- It opens the door to returning an arbitrary layout (fusing a `convert_layout` into this op)
- It is now really simple to generalise this op to perform cross-cluster reductions, provided that `convert_layout` supports them
- We fix some latent issues the previous implementation had when run on arbitrary linear layouts. We add a funky regression test that used to fail and now passes
- All this while being LOC-neutral!

In future PRs we will improve the choice of `tmpLl` to avoid the last `convert_layout` in many cases, and we will pack the inputs in shmem to be able to vectorize the load/stores for full reductions with multiple inputs. This PR was the result of quite a long (but rather successful) vibe-coding session together with `gpt-5.2-codex`. I found it particularly useful to be able to emit a ConvertLayout within this lowering rather than having to call the lowering of the function manually. This simplifies the code quite a bit and I would have struggled to convince MLIR to do so myself.
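The `tt.reduce` + `tt.assert uniform` rewrite from the ConSan entry above can be sketched in plain Python. This is purely illustrative (not Triton's API): instead of every thread asserting its own element, the per-element conditions are first reduced, so a single uniform assert fires per warp group:

```python
# Illustrative sketch of replacing a per-thread assert with a
# reduce + uniform assert. "conds" stands in for a per-thread
# condition tensor; names here are not real Triton functions.
def assert_in_thread(conds, msg):
    for c in conds:          # conceptually: one assert per thread
        assert c, msg

def assert_uniform(conds, msg):
    reduced = all(conds)     # tt.reduce over the condition tensor
    assert reduced, msg      # tt.assert uniform: fires once, condition is uniform

conds = [True] * 32
assert_in_thread(conds, "out-of-bounds access")
assert_uniform(conds, "out-of-bounds access")  # same observable behaviour
```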
* [BACKEND] Perform tree reductions on in-thread values (#9220) Stacked PRs:
- #9327
- #9318
- #9317
- #9221
- -> #9220

### [BACKEND] Perform tree reductions on in-thread values

We generate ternary trees for suitable integer ops and binary trees for everything else. We manually generate `{add,mul}.{f16,f32}x2` ops. This brings a speed-up to some gluon attention kernels.

* [Nvidia] Enable TMA im2col mode - LLVM lowering (#9322)

# Summary
This is the fifth PR in a series that enables TMA im2col mode (in addition to the existing tiled mode) for NVIDIA GPUs. The goal of the series is to support TMA im2col mode in the Gluon DSL.
- First PR: https://github.com/triton-lang/triton/pull/9202
- Second PR: https://github.com/triton-lang/triton/pull/9225
- Third PR: https://github.com/triton-lang/triton/pull/9303
- Fourth PR: https://github.com/triton-lang/triton/pull/9305
- -> Fifth PR: https://github.com/triton-lang/triton/pull/9322

PTX ISA documentation for TMA im2col mode: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode
TMA tensor descriptor documentation: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

# Summary of Changes
Added LLVM lowering logic for `AsyncTMACopyGlobalToLocalOpConversion` to support im2col mode.
## Im2col Mode Constraints

### pixelsPerColumn (non-contiguous dimension)
- **Maximum size**: 1024 elements
- **Corresponds to**: Spatial dimensions (N, D, H, W)
- **Block shape**: Restricted to match `shapePerCTA` (no splitting)
- **Rationale**: Avoids generating multiple TMA messages along spatial dimensions, eliminating complex offset calculations that would depend on input tensor shape and padding
- **Note**: 1024 is sufficient for most practical use cases

### channelsPerPixel (contiguous dimension)
- **Maximum size**: 256 elements, or the swizzle byte size if swizzle is enabled
- **Multiple messages**: Supported when the channel dimension exceeds the block size
- **Offset application**: Only coord[0] (the channel coordinate in PTX order) receives non-zero offsets

## Key Implementation Details
1. **Offset application**: For im2col mode, only the channel dimension receives non-zero offsets; spatial dimension offsets are always 0 (verified by assertion)
2. **Im2col offsets reversal**: Spatial offsets (e.g., `off_w`, `off_h`) are reversed to match PTX/CUDA innermost-to-outermost ordering, consistent with coordinate handling
3. **Alignment with tiled mode**: These constraints align with the tiled mode behavior used for GEMM operations

* [FPSAN] Introducing FpSan - floating point sanitizer (#9337) FpSanitizer ensures that arithmetically equivalent kernels produce the exact same results by replacing floating-point ops with integer ones. For every binary op we cast operands to int, perform the equivalent integer op, then cast back to float. Unary ops are replaced with identity. For dot ops we cast operands to int and perform a matmul in registers. The resulting kernel is of course numerically incorrect, but it should be independent of operation order. If two implementations match under fpsan, they are very likely algorithmically equivalent even if FP results differ due to ordering/reassociation/etc. Note about tensor memory: tcgen05_mma is lowered by loading operands into registers and manually emulating MMA. Because tcgen05_mma can be issued from a partition with 1 warp group while tmem load/stores require 4 or 8 warps, we replace all tmem uses with global scratch memory.

* [KERNELS] enable swap_xw on blackwell for non-mx matmuls (#9390) Helps significantly for ragged matmuls where the slice size is small; otherwise, blackwell will compile to mma.sync.

* [BACKEND] Implement support for cross-CTA tt.reduce (#9221) Stacked PRs:
- #9327
- #9318
- #9317
- -> #9221

### [BACKEND] Implement support for cross-CTA tt.reduce

The title of this PR is a bit of a lie.
Even though the lowering is now implemented to support cross-CTA reductions, it depends on `convert_layout` supporting them, and it doesn't currently support LinearLayouts. We should generalise this one first and then enable it here. We should also emit the correct cross-CTA barrier from `targetInfo` in the case of cross-CTA memory reuse. In this PR, we take the chance to also generalise the lowering to avoid convert layouts whenever possible.

* [UT] Generalize Proton tests by providing `device` fixture (#9351) The same approach as with other unit tests.

---------

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
Co-authored-by: Keren Zhou <kerenzhou@openai.com>

* [Gluon] Add Cluster Launch Control (CLC) support for Blackwell GPUs (#9361) This adds support for NVIDIA's Cluster Launch Control (CLC) feature on Blackwell (SM100+) GPUs, enabling dynamic work distribution for persistent kernels. CLC allows running workers to cancel not-yet-launched clusters and take over their work, improving load balancing when SM availability varies.

New Gluon API (triton.experimental.gluon.language.nvidia.blackwell.clc):
- try_cancel(result, mbar): Issue an async CLC request to cancel a pending cluster
- is_canceled(result): Check if cancellation succeeded (returns non-zero)
- get_first_ctaid(result, dim): Get the canceled cluster's first CTA ID

MLIR ops added:
- ttng.clc_try_cancel: Lowers to clusterlaunchcontrol.try_cancel.async PTX
- ttng.clc_is_canceled: Lowers to clusterlaunchcontrol.query_cancel.is_canceled
- ttng.clc_get_first_ctaid: Lowers to clusterlaunchcontrol.query_cancel.get_first_ctaid

All ops include SM100+ compute capability checks and emit errors on older GPUs. A tutorial is included demonstrating a CLC matmul achieving 92.5% of cuBLAS performance on 8192x8192x8192 FP16 matrices.

---------

Co-authored-by: Peter Bell <peterbell10@openai.com>

* [ConSan] Use `-Ofc mid` for ConSan compilation (#9394) I tested a variety of `ptxas` flags and found the following (using `01-attention-forward.py`):
```
           compile (sec)   execution (sec)
-O0        19              81
-O1        49              22
-O3        52              21
-Ofc max   19              82
-Ofc mid   21              9
-Ofc min   21              9
-Ofc 0     19              21
```
Shockingly, `-Ofc mid|min` yield the best total compilation+execution times for consan.

* [TritonGPU] Run Gluon canonicalizer in LLVM lowering when ConSan is on (#9396) The regular canonicalizer would rewrite layout conversions.

* Add missing NVGPUToLLVMPass dependency (#9398) CMake can sometimes fail with the message below. NVGPUToLLVMPass has a dependency on TritonGPUOpInterfacesIncGen.
```
FAILED: Compilers/triton/third_party/nvidia/lib/NVGPUToLLVM/CMakeFiles/NVGPUToLLVM.dir/NVGPUToLLVMPass.cpp.o
In file included from /__w/1/s/src/Compilers/triton/third_party/nvidia/lib/NVGPUToLLVM/NVGPUToLLVMPass.cpp:11:
In file included from /__w/1/s/src/Compilers/triton/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/Utility.h:8:
In file included from /__w/1/s/src/Compilers/triton/include/triton/Conversion/TritonGPUToLLVM/Utility.h:7:
In file included from /__w/1/s/src/Compilers/triton/include/triton/Conversion/TritonGPUToLLVM/TargetInfoBase.h:4:
In file included from /__w/1/s/src/Compilers/triton/include/triton/Conversion/MLIRTypes.h:5:
In file included from /__w/1/s/src/Compilers/triton/include/triton/Dialect/TritonGPU/IR/Dialect.h:11:
In file included from /__w/1/s/src/Compilers/triton/include/triton/Dialect/TritonGPU/IR/Attributes.h:6:
/__w/1/s/src/Compilers/triton/include/triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h:10:10: fatal error: 'triton/Dialect/TritonGPU/IR/OpInterfaces.h.inc' file not found
   10 | #include "triton/Dialect/TritonGPU/IR/OpInterfaces.h.inc"
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
* [Nvidia] Enable TMA im2col mode - Gluon API (#9391)

# Summary
This is the sixth PR in a series that enables TMA im2col mode (in addition to the existing tiled mode) for NVIDIA GPUs. The goal of the series is to support TMA im2col mode in the Gluon DSL.
- First PR: https://github.com/triton-lang/triton/pull/9202
- Second PR: https://github.com/triton-lang/triton/pull/9225
- Third PR: https://github.com/triton-lang/triton/pull/9303
- Fourth PR: https://github.com/triton-lang/triton/pull/9305
- Fifth PR: https://github.com/triton-lang/triton/pull/9322
- -> Sixth PR: https://github.com/triton-lang/triton/pull/9391

PTX ISA documentation for TMA im2col mode: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode
TMA tensor descriptor documentation: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html

# Summary of Changes
Added Gluon DSL frontend support for TMA im2col mode, completing the end-to-end path from the Python API to runtime descriptor creation.
- Extended `TensorDescriptor` with im2col fields (`mode`, `elementStrides`, `pixelBoxLowerCorner`, `pixelBoxUpperCorner`) and parameter validation
- Refactored `tensor_descriptor_type` into a base class with `tensor_descriptor_type` (tiled) and `tensor_descriptor_im2col_type` (im2col) subclasses
- Added an `offsets` parameter to `async_copy_global_to_shared` for im2col spatial offsets
- Updated specialization (`specialize.cc`) to emit `tensordesc_im2col<..., input_rank=N>` signatures and type parsing (`str_to_ty`) to reconstruct the correct types
- Updated the driver to parse im2col signatures and call `fill_tma_descriptor_im2col` at runtime
- Changed `amendFuncOp` to use `TensorDescInterface` instead of `TensorDescType` so im2col descriptors also receive `tt.nv_tma_desc`
- Added an end-to-end `test_tma_im2col` Gluon test
---------

Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: peterbell10 <peterbell10@live.co.uk>

* [BACKEND] Support generic multi-cta convert_layouts (#9317) Stacked PRs:
- #9327
- #9318
- -> #9317

### [BACKEND] Support generic multi-cta convert_layouts

We generalise the swizzling algorithm to work with blocks and generalise most of the memory lowerings to support layouts with blocks. We remove the legacy lowering. The generic swizzling algorithm for blocks might be fine, but we didn't try to be super clever. There might be some perf left on the table. We can look into this at a later point if it becomes relevant. We also activate multi-cta reductions in the process and test both there.

TODO: Add some funky tests that just test the `convert_layout`, not the `convert_layout` within the reduction.
TODO: Check how to perform multiCTA barriers in AMD and perhaps merge cluster barriers into ttg.barrier, predicate broadcasting blocks, etc.

* [Membar] Membar pass for clusters (#9318) Stacked PRs:
- #9327
- -> #9318

### [Membar] Membar pass for clusters

The main invariant here is that Membar for CTAs only synchronises CTAs when their buffers did not alias in the ttgir but do alias after the Allocation pass. In other words, in Gluon, the user is in charge of manually synchronising the buffers they declare. For now, we always emit a full cluster barrier. We can improve this in the future by emitting `mbarrier`s that just synchronise subsets of the CTAs. For that we would need to be a bit more clever, as we would need to allocate some `mbarrier`s, but the Allocation pass has already run... We add a number of test cases with comments on which of them are expected and which can be improved.
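The cluster-membar invariant described above can be sketched as a simple overlap check: a barrier is only required when two buffers that were distinct at the ttgir level end up in overlapping shared-memory ranges after the Allocation pass. All names here are illustrative, not the actual Membar analysis code:

```python
# Hedged sketch of the invariant: barrier only when aliasing is *created*
# by the Allocation pass, i.e. buffers distinct in ttgir but overlapping
# in their assigned smem ranges. Ranges are (start, end) byte offsets.
def ranges_overlap(a, b):
    (a_start, a_end), (b_start, b_end) = a, b
    return a_start < b_end and b_start < a_end

def needs_cluster_barrier(same_ttgir_buffer, range_a, range_b):
    if same_ttgir_buffer:
        # In Gluon the user is in charge of synchronising buffers
        # they declared themselves.
        return False
    return ranges_overlap(range_a, range_b)

# Two distinct ttgir buffers packed into overlapping smem slots:
print(needs_cluster_barrier(False, (0, 1024), (512, 2048)))  # -> True
```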
* [Membar] Fix non-trivial function smem offsets (#9327) Codex rightly identified that we were not considering the offsets of functions in our membar analysis at https://github.com/triton-lang/triton/pull/9318#discussion_r2740883625 Codex then went on and fixed it and added a regression test.

* Add descending flag to topk (#9355)

* Remove TritonIR dependence on TritonGPUIR (#9392)

* [AMD] Update gfx1250 MXFP FA example kernel (#9414) Include some recent optimizations for the MXFP FA kernel.

* [FPSAN] Fix fpsan crash with warp specialization + tmem (#9415) FpSan replaces tensor memory with global scratch, and was missing correct handling of passing global memory pointers to the warp_specialize op. Also, use the `-Ofc mid` ptx compilation mode for fpsan compilation.

* [AMD][NFC][BACKEND] Use ROCDL ops for (cluster) async load (#9410) Replaces intrinsics for (cluster) async loads with ROCDL ops.

* [AMD] Fix gpu::BarrierOp lowering for gfx1250 (#9416) Update gpu::BarrierOp lowering for gfx1250.

* [AMD] Broadcasting in v_perm based conversion (#9354) This PR enables support for broadcasted layouts in AMD-specific convert_layout lowering:
- Generalizes code that previously assumed the number of src and dst values is equal

---------

Co-authored-by: Alexander Efimov <efimov.alexander@gmail.com>

* [NVIDIA] Fully restore canSkipBarSync (#9281)

* [CI] Remove unused commands (#9419) Calling `df` twice should not have any impact; this seems like an artifact from somewhere else that should be removed.

* [CI] Remove use of `LLVM_ENABLE_TERMINFO` (#9423) This is a follow-up to #9419 from looking at `llvm-build.yml`. When building LLVM, setting `LLVM_ENABLE_TERMINFO=OFF` results in a CMake warning:
```
CMake Warning:
  Manually-specified variables were not used by the project:
    LLVM_ENABLE_TERMINFO
```
I searched for uses of this in LLVM and there are none today. The latest reference I found in GitHub issues/PRs was from 2024.
That makes me relatively confident this is no longer used, so I'm proposing we remove it.

* [Cluster Membar] Add a cross-cta barrier at the end of the kernel (#9413) If we have an outstanding read or write (I think it'll always be a read) and we didn't have a TMEM deallocation, we should add a cluster-level barrier for correctness.

* [AMD][Gluon] Implement 8 wave F16 FA PingPong kernel (#9427) Ping Pong is another way to implement a performant attention kernel, allowing coexecution between two waves in the same SIMD. In this PR we introduce an 8-wave PP/warp-pipelined kernel and related num_warp changes. We also cleaned up the way to invoke/select different variants of the attention kernel.

Signed-off-by: Stanley Winata <stanley.winata@amd.com>

* [AMD][gfx1250] Adjust (BLOCK) M/N in f16 gemm examples (#9421)

Co-authored-by: Alexander Weinrauch <Alexander.Weinrauch@amd.com>

* [Tutorial] Fix a ty…
1 parent dfabf6a commit da2e93b

496 files changed

Lines changed: 39002 additions & 10816 deletions


.github/workflows/integration-tests-amd.yml (1 addition, 1 deletion)

```diff
@@ -85,7 +85,7 @@ jobs:
             ~/.triton/json
           key: ${{ runner.os }}-${{ runner.arch }}-llvm-${{ steps.cache-key.outputs.llvm }}-nvidia-${{ steps.cache-key.outputs.nvidia }}-json-${{ steps.cache-key.outputs.json }}
       - name: Install dependencies
-        run: apt-get install -y clang lld ccache
+        run: apt-get update && apt-get install -y clang lld ccache
       - name: Inspect cache directories
         run: |
           mkdir -p ~/.triton
```

.github/workflows/integration-tests-nvidia.yml

Lines changed: 7 additions & 6 deletions
```diff
@@ -9,15 +9,16 @@ on:
 
 jobs:
   integration-tests-nvidia:
-    runs-on: ${{ matrix.runner }}
+    name: integration-tests-nvidia (${{ matrix.config.name }})
+    runs-on: ${{ matrix.config.runs_on }}
     timeout-minutes: 60
     # Let A100 and H100 continue even if GB200 fails, as it's a bit flaky
-    continue-on-error: ${{ matrix.runner[0] == 'nvidia-gb200'}}
+    continue-on-error: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
     strategy:
       matrix:
-        runner: ${{ fromJson(inputs.matrix) }}
+        config: ${{ fromJson(inputs.matrix) }}
     env:
-      RUNNER_TYPE: ${{ matrix.runner[0] }}
+      RUNNER_TYPE: ${{ matrix.config.runner_type }}
       TRITON_BUILD_WITH_CCACHE: "true"
       TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
       TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
@@ -69,7 +70,7 @@ jobs:
         run: |
           echo "$HOME/.local/bin" >> $GITHUB_PATH
       - name: Setup Python environment for GB200
-        if: ${{ matrix.runner[0] == 'nvidia-gb200' }}
+        if: ${{ startsWith(matrix.config.runner_type, 'nvidia-gb200') }}
         run: |
           echo "/venv/bin" >> $GITHUB_PATH
           echo "VIRTUAL_ENV=/venv" >> $GITHUB_ENV
@@ -90,7 +91,7 @@ jobs:
       - name: Run python tests on CUDA
         run: make NUM_PROCS=24 test-unit
       - name: Run interpreter tests
-        if: ${{ matrix.runner[0] == 'nvidia-h100' }}
+        if: ${{ matrix.config.runner_type == 'nvidia-h100' }}
         run: make test-interpret
       - name: Run regression tests
         run: make test-regression
```

.github/workflows/llvm-build.yml

Lines changed: 2 additions & 6 deletions
```diff
@@ -103,8 +103,6 @@ jobs:
           sudo apt-get autoremove -y
           sudo apt-get clean
           df -h
-          echo "Removing large directories"
-          df -h
 
     - name: Configure, Build, Test, and Install LLVM (Ubuntu and macOS x64)
       if: matrix.config.arch == 'x64' && (matrix.config.target-os == 'ubuntu' || matrix.config.target-os == 'macos')
@@ -124,7 +122,6 @@
         -DLLVM_ENABLE_PROJECTS="mlir;lld"
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-        -DLLVM_ENABLE_TERMINFO=OFF
         -DLLVM_ENABLE_ZSTD=OFF
         llvm-project/llvm
@@ -149,7 +146,6 @@
         -DLLVM_ENABLE_DIA_SDK=OFF
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU"
-        -DLLVM_ENABLE_TERMINFO=OFF
         -DLLVM_ENABLE_ZSTD=OFF
         llvm-project/llvm
@@ -214,7 +210,8 @@
         -DCMAKE_RANLIB="/usr/bin/aarch64-linux-gnu-ranlib" \
         -DCMAKE_STRIP="/usr/bin/aarch64-linux-gnu-strip" \
         -DCMAKE_SYSROOT=$SYSROOT \
-        -DLLVM_ENABLE_TERMINFO=OFF \
+        -DLLVM_INCLUDE_TESTS=OFF \
+        -DMLIR_INCLUDE_TESTS=OFF \
         llvm-project/llvm
       ninja -C llvm-project/build install
       tar czf "${{ env.llvm_install_dir }}.tar.gz" "${{ env.llvm_install_dir }}"
@@ -240,7 +237,6 @@
         -DLLVM_INSTALL_UTILS=ON
         -DLLVM_TARGETS_TO_BUILD="AArch64;NVPTX;AMDGPU"
         -DLLVM_USE_HOST_TOOLS=ON
-        -DLLVM_ENABLE_TERMINFO=OFF
         -DLLVM_ABI_BREAKING_CHECKS=FORCE_OFF
         llvm-project/llvm
```

.github/workflows/llvm-build/almalinux.Dockerfile

Lines changed: 0 additions & 1 deletion
```diff
@@ -35,7 +35,6 @@ RUN cmake -GNinja -Bbuild \
     -DLLVM_ENABLE_ASSERTIONS=ON \
     -DMLIR_ENABLE_BINDINGS_PYTHON=OFF \
     -DLLVM_ENABLE_PROJECTS="mlir;lld" \
-    -DLLVM_ENABLE_TERMINFO=OFF \
     -DLLVM_INSTALL_UTILS=ON \
     -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" \
    -DLLVM_ENABLE_ZSTD=OFF \
```

.github/workflows/runner-preparation.yml

Lines changed: 2 additions & 2 deletions
```diff
@@ -95,11 +95,11 @@ jobs:
       if: env.enable_integration == 'true'
       run: |
         if [ x"${{ github.repository }}" == x"triton-lang/triton" ]; then
-          echo '::set-output name=matrix-NVIDIA::[["nvidia-a100"], ["nvidia-h100"], ["nvidia-gb200"]]'
+          echo '::set-output name=matrix-NVIDIA::[{"name":"nvidia-a100","runner_type":"nvidia-a100","runs_on":["nvidia-a100"]},{"name":"nvidia-h100","runner_type":"nvidia-h100","runs_on":["nvidia-h100"]},{"name":"nvidia-gb200","runner_type":"nvidia-gb200","runs_on":{"group":"gb200-runner-set"}}]'
           echo '::set-output name=matrix-AMD::[["self-hosted", "gfx90a"], ["amd-gfx942"], ["amd-gfx950"]]'
           echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
         else
-          echo '::set-output name=matrix-NVIDIA::["ubuntu-latest"]'
+          echo '::set-output name=matrix-NVIDIA::[{"name":"ubuntu-latest","runner_type":"ubuntu-latest","runs_on":"ubuntu-latest"}]'
           echo '::set-output name=matrix-AMD::["ubuntu-latest"]'
           echo '::set-output name=matrix-MACOS::[["macos-latest"]]'
         fi
```
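The runner-preparation change above switches the NVIDIA matrix from bare runner-label lists to objects carrying `name`, `runner_type`, and `runs_on` fields, which the downstream workflow reads as `matrix.config.*`. A quick sketch of how the new payload parses (the JSON string is taken verbatim from the diff):

```python
import json

# The new matrix-NVIDIA payload emitted for triton-lang/triton.
matrix = json.loads(
    '[{"name":"nvidia-a100","runner_type":"nvidia-a100","runs_on":["nvidia-a100"]},'
    '{"name":"nvidia-h100","runner_type":"nvidia-h100","runs_on":["nvidia-h100"]},'
    '{"name":"nvidia-gb200","runner_type":"nvidia-gb200","runs_on":{"group":"gb200-runner-set"}}]'
)

# Each entry now carries an explicit runner_type, so workflow conditions like
# startsWith(matrix.config.runner_type, 'nvidia-gb200') no longer index into a list.
for config in matrix:
    print(config["name"], config["runner_type"], config["runs_on"])
```

Note that GB200 now targets a runner group (`runs_on` is an object) while the other entries keep plain label lists, something the old list-of-lists format could not express.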

.github/workflows/wheels.yml

Lines changed: 6 additions & 6 deletions
```diff
@@ -12,7 +12,7 @@ permissions: read-all
 jobs:
 
   Build-Wheels:
-    timeout-minutes: 120
+    timeout-minutes: 180
     runs-on: ${{ matrix.config.runs_on }}
 
     strategy:
@@ -99,12 +99,12 @@ jobs:
           path: ./wheelhouse/*.whl
 
       - name: Install Azure CLI
-        if: ${{ steps.check-version.outputs.new_commit == 'true' }}
+        if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
         run: |
           curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash
 
       - name: Azure login
-        if: ${{ steps.check-version.outputs.new_commit == 'true' }}
+        if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
         uses: azure/login@v2
         with:
           client-id: ${{ secrets.AZURE_CLIENT_ID }}
@@ -113,20 +113,20 @@
 
       - id: generate-token
         name: Generate token
-        if: ${{ steps.check-version.outputs.new_commit == 'true' }}
+        if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
         run: |
           AZ_TOKEN=$(az account get-access-token --query accessToken)
           echo "::add-mask::$AZ_TOKEN"
           echo "access_token=$AZ_TOKEN" >> "$GITHUB_OUTPUT"
 
       - name: Publish wheels to Azure DevOps
-        if: ${{ steps.check-version.outputs.new_commit == 'true' }}
+        if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') }}
         run: |
           python3 -m pip install twine
           python3 -m twine upload -r Triton-Nightly -u TritonArtifactsSP -p ${{ steps.generate-token.outputs.access_token }} --config-file utils/nightly.pypirc --non-interactive --verbose wheelhouse/*
 
       - name: Azure Logout
-        if: ${{ steps.check-version.outputs.new_commit == 'true' && (success() || failure()) }}
+        if: ${{ steps.check-version.outputs.new_commit == 'true' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && (success() || failure()) }}
         run: |
           az logout
           az cache purge
```
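Every Azure publishing step in wheels.yml is now gated on the same compound condition. A pure-Python rendering of that expression (a sketch, not GitHub's expression evaluator):

```python
def should_publish(new_commit: str, event_name: str) -> bool:
    # Mirrors the workflow condition:
    #   new_commit == 'true' && (event_name == 'schedule' || event_name == 'workflow_dispatch')
    # GitHub Actions step outputs are strings, hence the 'true' comparison.
    return new_commit == "true" and event_name in ("schedule", "workflow_dispatch")

# With the extra event_name clause, only scheduled and manually dispatched
# runs publish; pushes and pull requests are skipped even with a new commit.
print(should_publish("true", "schedule"))
print(should_publish("true", "pull_request"))
```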

.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -70,6 +70,7 @@ ptxas-blackwell
 # Third-party include
 third_party/nvidia/backend/include
 third_party/nvidia/backend/lib/cupti
+third_party/nvidia/backend/lib/cupti-blackwell
 
 # Docs
 docs/_build/
@@ -93,3 +94,5 @@ docs/sg_execution_times.rst
 
 # macOS
 .DS_Store
+
+AGENTS.override.md
```

AGENTS.md

Lines changed: 10 additions & 0 deletions
```diff
@@ -0,0 +1,10 @@
+# Working on Triton
+
+## Build and Testing Guidelines
+- Before running any tests, run `make` in the triton directory to rebuild triton.
+- For compiler changes, add tests in `python/test/` (pytest) or `test/` (lit). Keep GPU-only tests in `python/test/unit/` or `python/test/gluon/`, name them `test_<feature>_<condition>`, and avoid creating new test files unless requested.
+- Run pytest with `-s --tb=short`. Run a single test with `pytest file.py::test_name`.
+- The build dir is given by `BUILD_DIR := $(shell cd python; $(PYTHON) -c 'from build_helpers import get_cmake_dir; print(get_cmake_dir())')`
+- Run lit from the build dir: `cd BUILD_DIR; ninja triton-opt; lit -v test/<path>.mlir` (example: `lit -v test/TritonNvidiaGPU/tmem_layouts.mlir`).
+- Lit tests can be run locally (no GPU required).
+- Compiler crashes sometimes print an MLIR reproducer (external_resources / mlir_reproducer). Save the full MLIR + `{-# ... #-}` metadata to `/tmp/<file>.mlir`, then run `triton-opt /tmp/<file>.mlir --run-reproducer` to reproduce locally.
```

CMakeLists.txt

Lines changed: 118 additions & 0 deletions
```diff
@@ -20,10 +20,126 @@ option(TRITON_BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF)
 option(TRITON_BUILD_PROTON "Build the Triton Proton profiler" ON)
 option(TRITON_BUILD_UT "Build C++ Triton Unit Tests" ON)
 option(TRITON_BUILD_WITH_CCACHE "Build with ccache (if available)" ON)
+option(TRITON_OFFLINE_BUILD "Build without downloading dependencies" OFF)
 option(LLVM_BUILD_SHARED_LIBS
   "Build all libraries as shared libraries instead of static" OFF)
 set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")
 
+set(TRITON_CACHE_PATH "" CACHE PATH "Path to triton cache")
+set(TRITON_LLVM_SYSTEM_SUFFIX "" CACHE STRING "Path to LLVM system suffix")
+set(LLVM_SYSPATH "" CACHE PATH "Path to system LLVM installation")
+set(JSON_SYSPATH "" CACHE PATH "Path to system nlohmann/json headers")
+set(TRITON_PTXAS_PATH "" CACHE FILEPATH "Path override for ptxas")
+set(TRITON_PTXAS_BLACKWELL_PATH "" CACHE FILEPATH "Path override for ptxas-blackwell")
+set(TRITON_CUOBJDUMP_PATH "" CACHE FILEPATH "Path override for cuobjdump")
+set(TRITON_NVDISASM_PATH "" CACHE FILEPATH "Path override for nvdisasm")
+set(TRITON_CUDACRT_PATH "" CACHE PATH "Path to CUDA CRT headers.")
+set(TRITON_CUDART_PATH "" CACHE PATH "Path to CUDA Runtime headers")
+set(TRITON_CUPTI_INCLUDE_PATH "" CACHE PATH "Path to CUPTI headers")
+set(TRITON_CUPTI_LIB_PATH "" CACHE PATH "Path to CUPTI libraries")
+
+if(NOT TRITON_CACHE_PATH)
+  message(FATAL_ERROR "TRITON_CACHE_PATH must be set or derivable from TRITON_HOME/HOME/USERPROFILE/HOMEPATH.")
+endif()
+
+set(TRITON_BUILD_HELPER_COMMON_ARGS --triton-cache-path "${TRITON_CACHE_PATH}")
+if("${TRITON_OFFLINE_BUILD}")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-offline-build)
+endif()
+if(NOT "${TRITON_LLVM_SYSTEM_SUFFIX}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-llvm-system-suffix "${TRITON_LLVM_SYSTEM_SUFFIX}")
+endif()
+if(NOT "${LLVM_SYSPATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --llvm-syspath "${LLVM_SYSPATH}")
+endif()
+if(NOT "${JSON_SYSPATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --json-syspath "${JSON_SYSPATH}")
+endif()
+if(NOT "${TRITON_PTXAS_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-ptxas-path "${TRITON_PTXAS_PATH}")
+endif()
+if(NOT "${TRITON_PTXAS_BLACKWELL_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-ptxas-blackwell-path "${TRITON_PTXAS_BLACKWELL_PATH}")
+endif()
+if(NOT "${TRITON_CUOBJDUMP_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-cuobjdump-path "${TRITON_CUOBJDUMP_PATH}")
+endif()
+if(NOT "${TRITON_NVDISASM_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-nvdisasm-path "${TRITON_NVDISASM_PATH}")
+endif()
+if(NOT "${TRITON_CUDACRT_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-cudacrt-path "${TRITON_CUDACRT_PATH}")
+endif()
+if(NOT "${TRITON_CUDART_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-cudart-path "${TRITON_CUDART_PATH}")
+endif()
+if(NOT "${TRITON_CUPTI_INCLUDE_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-cupti-include-path "${TRITON_CUPTI_INCLUDE_PATH}")
+endif()
+if(NOT "${TRITON_CUPTI_LIB_PATH}" STREQUAL "")
+  list(APPEND TRITON_BUILD_HELPER_COMMON_ARGS --triton-cupti-lib-path "${TRITON_CUPTI_LIB_PATH}")
+endif()
+
+# Resolve third-party package paths in CMake so direct CMake builds don't rely on setup.py.
+set(TRITON_HAS_LLVM_SYSPATH OFF)
+if(NOT "${LLVM_SYSPATH}" STREQUAL "")
+  set(TRITON_HAS_LLVM_SYSPATH ON)
+  if(NOT DEFINED LLVM_INCLUDE_DIRS)
+    set(LLVM_INCLUDE_DIRS "${LLVM_SYSPATH}/include")
+  endif()
+  if(NOT DEFINED LLVM_LIBRARY_DIR)
+    set(LLVM_LIBRARY_DIR "${LLVM_SYSPATH}/lib")
+  endif()
+endif()
+
+if(NOT "${JSON_SYSPATH}" STREQUAL "" AND NOT DEFINED JSON_INCLUDE_DIR)
+  set(JSON_INCLUDE_DIR "${JSON_SYSPATH}/include")
+endif()
+
+# Regenerate configure outputs during `cmake --build` when helper inputs change.
+set_property(
+  DIRECTORY
+  APPEND
+  PROPERTY CMAKE_CONFIGURE_DEPENDS
+  "${CMAKE_CURRENT_SOURCE_DIR}/python/build_helpers.py"
+  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/llvm-hash.txt"
+  "${CMAKE_CURRENT_SOURCE_DIR}/cmake/json-version.txt"
+)
+find_package(Python3 REQUIRED COMPONENTS Interpreter)
+set(TRITON_THIRD_PARTY_CMAKE_VARS_FILE "${CMAKE_CURRENT_BINARY_DIR}/triton-third-party-vars.cmake")
+execute_process(
+  COMMAND
+    ${Python3_EXECUTABLE}
+    "${CMAKE_CURRENT_SOURCE_DIR}/python/build_helpers.py"
+    write_thirdparty_cmake_vars
+    ${TRITON_BUILD_HELPER_COMMON_ARGS}
+    --output
+    "${TRITON_THIRD_PARTY_CMAKE_VARS_FILE}"
+    --packages llvm json
+  WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+  COMMAND_ERROR_IS_FATAL ANY
+)
+include("${TRITON_THIRD_PARTY_CMAKE_VARS_FILE}")
+
+if(TRITON_BUILD_PYTHON_MODULE)
+  set_property(
+    DIRECTORY
+    APPEND
+    PROPERTY CMAKE_CONFIGURE_DEPENDS
+    "${CMAKE_CURRENT_SOURCE_DIR}/python/build_helpers.py"
+    "${CMAKE_CURRENT_SOURCE_DIR}/cmake/nvidia-toolchain-version.json"
+  )
+  find_package(Python3 REQUIRED COMPONENTS Interpreter)
+  execute_process(
+    COMMAND ${Python3_EXECUTABLE}
+      ${CMAKE_CURRENT_SOURCE_DIR}/python/build_helpers.py
+      download_and_copy_dependencies
+      ${TRITON_BUILD_HELPER_COMMON_ARGS}
+    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
+    COMMAND_ERROR_IS_FATAL ANY
+  )
+endif()
+
 if(TRITON_BUILD_WITH_CCACHE)
   find_program(CCACHE_PROGRAM ccache)
   if(CCACHE_PROGRAM)
@@ -237,6 +353,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
     MLIRIndexToLLVM
     MLIRGPUToROCDLTransforms
     MLIRUBToLLVM
+    MLIRPluginsLib
 
     # LLVM
     LLVMPasses
@@ -343,6 +460,7 @@ find_package(Threads REQUIRED)
 add_subdirectory(third_party/f2reduce)
 add_subdirectory(bin)
 add_subdirectory(test)
+add_subdirectory(examples)
 
 if(TRITON_BUILD_UT)
   add_subdirectory(unittest)
```
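The new CMakeLists logic above assembles `TRITON_BUILD_HELPER_COMMON_ARGS` by appending one CLI flag per non-empty cache variable before invoking `python/build_helpers.py`. A rough Python equivalent of that assembly pattern (the function and example values are illustrative, not part of the build):

```python
def build_helper_args(cache_path, offline=False, overrides=None):
    """Sketch of the CMake arg-assembly: one flag-value pair per non-empty setting."""
    args = ["--triton-cache-path", cache_path]
    if offline:
        args.append("--triton-offline-build")
    for flag, value in (overrides or {}).items():
        if value:  # empty-string cache variables are skipped, as in the CMake code
            args.extend([flag, value])
    return args

args = build_helper_args(
    "/home/user/.triton",
    offline=True,
    overrides={
        "--triton-ptxas-path": "/opt/cuda/bin/ptxas",  # hypothetical override
        "--llvm-syspath": "",                          # unset -> omitted
    },
)
print(args)
```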

MANIFEST.in

Lines changed: 3 additions & 0 deletions
```diff
@@ -14,3 +14,6 @@ include Makefile
 include python/build_helpers.py
 include python/requirements.txt
 include python/test-requirements.txt
+global-exclude __pycache__
+global-exclude __pycache__/*
+global-exclude *.py[cod]
```
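The `global-exclude` patterns added to MANIFEST.in use shell-glob syntax, so the single character class `*.py[cod]` covers all compiled Python artifacts. The stdlib `fnmatch` implements the same glob character-class semantics, which makes the pattern easy to sanity-check (a sketch; setuptools does its own pattern translation internally):

```python
from fnmatch import fnmatch

# *.py[cod] matches .pyc, .pyo and .pyd files, but not .py sources.
for name in ("mod.pyc", "mod.pyo", "mod.pyd", "mod.py"):
    print(name, fnmatch(name, "*.py[cod]"))
```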
