Commit 7bee576

agron911 authored and meta-codesync[bot] committed
[lint-autofix] Fix pre-commit formatting issues (#1287)
Summary:
Pull Request resolved: #1287

Automated formatting fix generated by running `pre-commit run --all-files` and `pre-commit run clang-format --all-files` on the GitHub mirror (facebookexperimental/triton). 61 file(s) fixed.

Reviewed By: xuzhao9

Differential Revision: D101703819

fbshipit-source-id: 73152b426940741b18b1395afa30bb8ea31f885f
1 parent ab27190 · commit 7bee576

61 files changed

Lines changed: 917 additions & 1334 deletions
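
The whole change was produced by the two `pre-commit` invocations quoted in the summary. For readers unfamiliar with the flow, a minimal sketch of driving the same commands from a script (this wrapper is hypothetical, not the bot's actual tooling):

```python
import subprocess

def run_autofix(repo_root: str = ".") -> None:
    # The same two commands named in the commit summary. pre-commit exits
    # non-zero when hooks modify files, so check=True is deliberately omitted.
    for cmd in (
        ["pre-commit", "run", "--all-files"],
        ["pre-commit", "run", "clang-format", "--all-files"],
    ):
        subprocess.run(cmd, cwd=repo_root)
```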


third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv5.cpp

Lines changed: 5 additions & 6 deletions
@@ -667,12 +667,11 @@ LogicalResult convertScaledDot(const LLVMTypeConverter &typeConverter,
                                mxfpInstKind, twoCTAs);
   };

-  return convertDotImpl(typeConverter, rewriter, loc, op.getA(), op.getB(),
-                        adaptor.getA(), adaptor.getB(), dTensorTy,
-                        adaptor.getUseD(), adaptor.getPred(),
-                        adaptor.getBarriers(), adaptor.getBarrierPreds(),
-                        twoCTAs, tlx::tlxEnablePairedMMA(op), opKindIsMXFP4,
-                        dot);
+  return convertDotImpl(
+      typeConverter, rewriter, loc, op.getA(), op.getB(), adaptor.getA(),
+      adaptor.getB(), dTensorTy, adaptor.getUseD(), adaptor.getPred(),
+      adaptor.getBarriers(), adaptor.getBarrierPreds(), twoCTAs,
+      tlx::tlxEnablePairedMMA(op), opKindIsMXFP4, dot);
 }

 //===----------------------------------------------------------------------===//

third_party/tileir/PerformanceTuningTips.md

Lines changed: 3 additions & 3 deletions
@@ -14,7 +14,7 @@ The **occupancy** hint accepts an integer N from 1 to 32, indicating that the pr

 Unlike the Triton PTX backend, the CUDA Tile IR Backend disables approx and ftz by default. Setting `TILEIR_ENABLE_APPROX=1` and `TILEIR_ENABLE_FTZ=1` can provide performance improvements in certain workloads (with precision degradation within acceptable ranges), such as **`attention`** and its variant kernels.

-Note that the TileIR compiler (`tileiras`) shipping in CUDA 13.1 does not automatically optimize `exp.approx -> ex2 + mulf`. For performance and precision parity with the Triton PTX backend, please explicitly rewrite `expOp` to use `ex2 + mulf` instead. 
+Note that the TileIR compiler (`tileiras`) shipping in CUDA 13.1 does not automatically optimize `exp.approx -> ex2 + mulf`. For performance and precision parity with the Triton PTX backend, please explicitly rewrite `expOp` to use `ex2 + mulf` instead.

 #### opt-level

@@ -68,11 +68,11 @@ sudo nvidia-smi -i 0 -pm 1; sudo nvidia-smi -i 0 -pl 1000; sudo nvidia-smi -i 0

 ![Fused Attention Backward Benchmark](./fused-attention-bwd.png)

-### Persistent Matmul (09-persistent-matmul.py) 
+### Persistent Matmul (09-persistent-matmul.py)

 > TFLOPS by Proton

-#### NVIDIA PTX backend 
+#### NVIDIA PTX backend

 | Kernel Name | K=512 | K=1024 | K=1536 | K=2048 | K=2560 | K=3072 | K=3584 | K=4096 | K=4608 | K=5120 | K=5632 | K=6144 | K=6656 | K=7168 | K=7680 | K=8192 |
 |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
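
The `exp.approx -> ex2 + mulf` rewrite called for in the doc change above follows from exp(x) = 2^(x · log2 e), i.e. one multiply feeding an `ex2`. A minimal Triton sketch of what such an explicit rewrite could look like in kernel code (the kernel and its names are illustrative, not from this commit):

```python
import triton
import triton.language as tl

LOG2E = 1.4426950408889634  # log2(e): exp(x) == exp2(x * log2(e))

@triton.jit
def exp_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    # Instead of tl.exp(x): emit the mulf + ex2 pair explicitly so the
    # TileIR backend matches the PTX backend's exp.approx lowering.
    y = tl.exp2(x * LOG2E)
    tl.store(y_ptr + offs, y, mask=mask)
```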

third_party/tileir/backend/code_generator.py

Lines changed: 3 additions & 4 deletions
@@ -35,6 +35,7 @@
 
 from triton.backends.tileir.conf import TileIREnvConf
 
+
 def mangle_fn(name, arg_tys, caller_context):
     # doesn't mangle ret type, which must be a function of arg tys
     mangled_args = '_'.join([tileir_mangle_ty(ty) for ty in arg_tys])
@@ -46,20 +47,18 @@ def mangle_fn(name, arg_tys, caller_context):
         ret += caller_context.mangle()
     return ret
 
+
 def tileir_mangle_ty(ty):
     return ty.mangle()
 
 
 def tileir_mangle_fn(name, arg_tys, constants):
     # doesn't mangle ret type, which must be a function of arg tys
     mangled_arg_names = "_".join([tileir_mangle_ty(ty) for ty in arg_tys])
-    mangled_constants = "_".join(
-        [f"{i}c{repr(constants[i])}" for i in sorted(constants)]
-    )
+    mangled_constants = "_".join([f"{i}c{repr(constants[i])}" for i in sorted(constants)])
     mangled_constants = mangled_constants.replace(".", "_d_")
     mangled_constants = mangled_constants.replace("'", "_sq_")
     # [ and ] are not allowed in LLVM identifiers
     mangled_constants = mangled_constants.replace('[', '_').replace(']', '_')
     ret = f'{name}__{mangled_arg_names}__{mangled_constants}'
     return ret
-
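
As an aside for readers of `tileir_mangle_fn` above: the mangled name is the kernel name, the mangled argument types, and the sanitized constant values joined with double underscores. A standalone run of the same replacement rules (the type strings are stand-ins for what the real `ty.mangle()` would return):

```python
def mangle(name, arg_ty_strs, constants):
    # Same sanitization steps as tileir_mangle_fn above.
    consts = "_".join(f"{i}c{repr(constants[i])}" for i in sorted(constants))
    consts = consts.replace(".", "_d_").replace("'", "_sq_")
    consts = consts.replace('[', '_').replace(']', '_')  # not legal in LLVM ids
    return f"{name}__{'_'.join(arg_ty_strs)}__{consts}"

print(mangle("kern", ["fp32", "i32"], {2: 0.5}))  # kern__fp32_i32__2c0_d_5
```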

third_party/tileir/backend/compiler.py

Lines changed: 15 additions & 35 deletions
@@ -17,6 +17,8 @@
 import subprocess
 import sys
 from pathlib import Path
+
+
 def format_compute_capability(capability: int) -> str:
     """
     Format compute capability for GPU architecture.
@@ -52,14 +54,15 @@ def TemporaryDirectory(suffix=None, prefix=None, dir=None, delete=True):
     if delete:
         shutil.rmtree(temp_dir)
 
+
 @dataclass(frozen=True)
 class TileIROptions:
     ########################## tileIR core options ##########################
     backend_name: str = 'tileir'
     arch: str = None
     num_ctas: int = 1
     # tileir use num_stages to control the op cost, see <tileir_link>
-    num_stages: int = 3 
+    num_stages: int = 3
     # tileir use opt_level to control the optimization level, see <tileir_link>
     opt_level: int = 3
     # tileir use occupancy to control the register usage, see <tileir_link>
@@ -103,10 +106,10 @@ def enable_ftz(self):
     @property
     def enable_approx(self):
         return TileIREnvConf.enable_approx()
+
     def __post_init__(self):
-        assert (
-            self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0
-        ), "num_warps must be a power of 2"
+        assert (self.num_warps > 0 and (self.num_warps & (self.num_warps - 1)) == 0), "num_warps must be a power of 2"
+
     def hash(self):
         hash_dict = dict(self.__dict__)
         # Get all property values from class __dict__
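
The reformatted assert keeps the usual power-of-two bit test: for n > 0, `n & (n - 1)` clears the lowest set bit, so it is zero exactly when n has a single set bit. A quick standalone check:

```python
def is_power_of_two(n: int) -> bool:
    # n - 1 flips every bit up to and including the lowest set bit,
    # so the AND is zero only when n had exactly one set bit.
    return n > 0 and (n & (n - 1)) == 0

assert [n for n in range(1, 17) if is_power_of_two(n)] == [1, 2, 4, 8, 16]
```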
@@ -115,13 +118,7 @@ def hash(self):
             hash_dict[name] = getattr(self, name)
         # Exclude num_warps from hash since it doesn't affect compilation output.
         # This enables kernel cache sharing for configs that only differ in num_warps.
-        key = "_".join(
-            [
-                f"{name}-{val}"
-                for name, val in sorted(hash_dict.items())
-                if name != "num_warps"
-            ]
-        )
+        key = "_".join([f"{name}-{val}" for name, val in sorted(hash_dict.items()) if name != "num_warps"])
         return hashlib.sha256(key.encode("utf-8")).hexdigest()
 
 
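Because `num_warps` is filtered out of the key, option sets that differ only in `num_warps` map to the same cache entry, which is exactly the sharing behavior the comment describes. A toy illustration with plain dicts (the values are hypothetical; the key construction mirrors `hash()` above):

```python
import hashlib

def options_key(opts: dict) -> str:
    # Sorted name-value pairs, num_warps excluded, as in TileIROptions.hash().
    key = "_".join(f"{k}-{v}" for k, v in sorted(opts.items()) if k != "num_warps")
    return hashlib.sha256(key.encode("utf-8")).hexdigest()

a = options_key({"opt_level": 3, "num_stages": 3, "num_warps": 4})
b = options_key({"opt_level": 3, "num_stages": 3, "num_warps": 8})
assert a == b  # only num_warps differs, so the kernel cache entry is shared
```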
@@ -130,6 +127,7 @@ def get_tileir_version():
 
 
 class TileIRBackend(BaseBackend):
+
     def get_module_map(self):
         from triton.language.extra.cuda import libdevice
 
@@ -152,14 +150,7 @@ def __init__(self, target: GPUTarget) -> None:
 
     def parse_options(self, opts) -> Any:
         args = {"arch": os.getenv("TRITON_OVERRIDE_ARCH", f"sm{self.target.arch}")}
-        args.update(
-            {
-                k: opts[k]
-                for k in TileIROptions.__dataclass_fields__.keys()
-                if k in opts
-                if opts[k] is not None
-            }
-        )
+        args.update({k: opts[k] for k in TileIROptions.__dataclass_fields__.keys() if k in opts if opts[k] is not None})
         capability = int(self._parse_arch(args["arch"]))
         if "supported_fp8_dtypes" not in args:
             supported_fp8_dtypes = set(TileIROptions.supported_fp8_dtypes)
@@ -288,19 +279,14 @@ def make_tileir(mod, metadata, opt: TileIROptions, capability):
             opt.occupancy,
             metadata["num_stages"],
         )
-        tileir.passes.add_auto_gen_memtoken(
-            pm,
-            opt.enable_autogen_alias_mem_token
-        )
+        tileir.passes.add_auto_gen_memtoken(pm, opt.enable_autogen_alias_mem_token)
         passes.common.add_inliner(pm)
         if opt.enable_fp_fusion:
             tileir.passes.add_fma_fusion(pm)
         tileir.passes.add_strip_debuginfo(pm)
         pm.run(mod, "make_tileir")
         if not tileir.only_contain_legal_dialects(mod):
-            raise RuntimeError(
-                "Triton ttir to tileir ir failed. Some ttir ops cannot be converted to tileir."
-            )
+            raise RuntimeError("Triton ttir to tileir ir failed. Some ttir ops cannot be converted to tileir.")
 
         pattern = r"entry @([a-zA-Z0-9_]*)\("
         match = re.findall(pattern, mod.__str__())
@@ -316,15 +302,9 @@ def make_cubin(mod, metadata, opt: TileIROptions, capability):
     def add_stages(self, stages, options, language):
         assert language == Language.TRITON, "Only TRITON language is supported for now"
         capability = int(self._parse_arch(options.arch))
-        stages["ttir"] = lambda src, metadata: self.make_ttir(
-            src, metadata, options, capability
-        )
-        stages["tileIR"] = lambda src, metadata: self.make_tileir(
-            src, metadata, options, capability
-        )
-        stages["cubin"] = lambda src, metadata: self.make_cubin(
-            src, metadata, options, capability
-        )
+        stages["ttir"] = lambda src, metadata: self.make_ttir(src, metadata, options, capability)
+        stages["tileIR"] = lambda src, metadata: self.make_tileir(src, metadata, options, capability)
+        stages["cubin"] = lambda src, metadata: self.make_cubin(src, metadata, options, capability)
 
     @functools.lru_cache()
     def hash(self):
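
`add_stages` registers the ttir → tileIR → cubin pipeline as a dict of lambdas keyed by stage name. A toy model of how a driver might thread an artifact through such a dict (the runner loop is an assumption; only the dict-of-lambdas shape comes from the code above):

```python
# Hypothetical stage functions standing in for make_ttir/make_tileir/make_cubin.
stages = {
    "ttir": lambda src, metadata: f"ttir({src})",
    "tileIR": lambda src, metadata: f"tileIR({src})",
    "cubin": lambda src, metadata: f"cubin({src})",
}

artifact, metadata = "kernel_src", {}
for name, stage in stages.items():  # dicts preserve insertion order
    artifact = stage(artifact, metadata)
print(artifact)  # cubin(tileIR(ttir(kernel_src)))
```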

third_party/tileir/backend/conf.py

Lines changed: 4 additions & 1 deletion
@@ -5,6 +5,7 @@
 
 
 class TileIREnvConf:
+
     @staticmethod
     def enable_approx():
         # Enable approximate calculation, trading off numerical precision for performance gains
@@ -35,7 +36,8 @@ def get_tileiras_path():
         path = os.path.join(cuda_home, "bin", "tileiras")
         if os.path.exists(path):
             import subprocess
-            version_output = subprocess.check_output([path, "--version"], encoding="utf-8", stderr=subprocess.STDOUT)
+            version_output = subprocess.check_output([path, "--version"], encoding="utf-8",
+                                                     stderr=subprocess.STDOUT)
             if "release 13.1" in version_output:
                 return path
         from shutil import which
@@ -71,6 +73,7 @@ def get_sm_arch():
     def enable_tma_offset_assert_check():
         return os.getenv("NVT_TMA_OFFSET_CHECK", "0") == "1"
 
+
 @contextmanager
 def set_env_var(var_name, new_value):
     # Save the original value of the environment variable
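
Only the first line of `set_env_var` is visible in this hunk; the rest of a save/restore context manager typically looks like the sketch below (the restore logic is an assumption, not shown in the diff):

```python
import os
from contextlib import contextmanager

@contextmanager
def set_env_var(var_name, new_value):
    # Save the original value of the environment variable
    original = os.environ.get(var_name)
    os.environ[var_name] = new_value
    try:
        yield
    finally:
        # Assumed restore step: put back the old value, or unset it.
        if original is None:
            os.environ.pop(var_name, None)
        else:
            os.environ[var_name] = original
```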

third_party/tileir/backend/driver.c

Lines changed: 6 additions & 9 deletions
@@ -76,23 +76,21 @@ static PyObject *loadtileIRBinary(PyObject *self, PyObject *args) {
       cuFuncGetAttribute(&n_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, fun));
   CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
       cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun));
-    n_spills /= 4; // Convert bytes to number of 32-bit registers.
-  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(
-      cuFuncGetAttribute(&static_smem_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
+  n_spills /= 4; // Convert bytes to number of 32-bit registers.
+  CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
+      &static_smem_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun));
   CUDA_CHECK_AND_RETURN_NULL_ALLOW_THREADS(cuFuncGetAttribute(
-          &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
+      &n_max_threads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, fun));
 
   Py_END_ALLOW_THREADS;
 
   if (PyErr_Occurred()) {
     return NULL;
   }
-  return Py_BuildValue(
-      "(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs, n_spills, n_max_threads
-  );
+  return Py_BuildValue("(KKiii)", (uint64_t)mod, (uint64_t)fun, n_regs,
+                       n_spills, n_max_threads);
 }
 
-
 static PyMethodDef ModuleMethods[] = {
     {"load_tileir_binary", loadtileIRBinary, METH_VARARGS,
      "Load provided tileir into CUDA driver"},
@@ -114,4 +112,3 @@ PyMODINIT_FUNC PyInit_tileir_utils(void) {
 
   return m;
 }
-
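
`loadtileIRBinary` hands Python the `Py_BuildValue("(KKiii)")` tuple above: module handle, function handle, register count, spill count (already converted from bytes to 32-bit registers), and max threads per block. A self-contained sketch of consuming that tuple (the stand-in function below is hypothetical; the real values come from the extension module):

```python
def load_tileir_binary_result():
    # Stand-in for the extension call; real code returns the "(KKiii)" tuple.
    return (0x1000, 0x2000, 32, 512 // 4, 1024)

mod, fun, n_regs, n_spills, n_max_threads = load_tileir_binary_result()
# n_spills counts 32-bit registers: LOCAL_SIZE_BYTES (512 here) / 4.
print(f"{n_regs} regs, {n_spills} spill regs, <= {n_max_threads} threads/block")
```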

third_party/tileir/backend/driver.py

Lines changed: 14 additions & 21 deletions
@@ -9,12 +9,7 @@
 import tempfile
 import threading
 import torch
-from triton.backends.nvidia.driver import (
-    library_dirs,
-    include_dirs,
-    libraries,
-    ty_to_cpp
-)
+from triton.backends.nvidia.driver import (library_dirs, include_dirs, libraries, ty_to_cpp)
 
 from triton import knobs
 from triton.runtime.build import compile_module_from_src
@@ -24,13 +19,13 @@
 from triton.backends.tileir.conf import TileIREnvConf
 from triton.tools.tensor_descriptor import TensorDescriptor
 
-
 # ------------------------
 # Utils
 # ------------------------
 
 
 class TileIRUtils(object):
+
     def __new__(cls):
         if not hasattr(cls, "instance"):
             cls.instance = super(TileIRUtils, cls).__new__(cls)
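
`TileIRUtils.__new__` above is the cached-instance singleton pattern: the first construction stores the instance on the class and every later call returns it. The same pattern in isolation (the trailing `return` is implied by the pattern; it sits past the end of this hunk):

```python
class Singleton(object):
    def __new__(cls):
        if not hasattr(cls, "instance"):
            # First construction: cache the instance on the class.
            cls.instance = super(Singleton, cls).__new__(cls)
        return cls.instance

assert Singleton() is Singleton()
```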
@@ -40,11 +35,11 @@ def __init__(self):
         tile_mod_path = dirname
         nvidia_mod_path = os.path.join(os.path.dirname(dirname), "nvidia")
         tile_mod = compile_module_from_src(
-            Path(os.path.join(tile_mod_path, "driver.c")).read_text(), "tileir_utils", library_dirs(), include_dirs, libraries
-        )
+            Path(os.path.join(tile_mod_path, "driver.c")).read_text(), "tileir_utils", library_dirs(), include_dirs,
+            libraries)
         nvidia_mod = compile_module_from_src(
-            Path(os.path.join(nvidia_mod_path, "driver.c")).read_text(), "cuda_utils", library_dirs(), include_dirs, libraries
-        )
+            Path(os.path.join(nvidia_mod_path, "driver.c")).read_text(), "cuda_utils", library_dirs(), include_dirs,
+            libraries)
         self.init_nvidia_function(nvidia_mod)
         self.init_tileir_function(tile_mod)
 
@@ -61,7 +56,6 @@ def init_nvidia_function(self, mod):
 # Launcher
 # ------------------------
 
-
 dirname = os.path.dirname(__file__)
 
 FLOAT_STORAGE_TYPE = {
@@ -79,12 +73,12 @@ def init_nvidia_function(self, mod):
     "fp64": "pack_fp64",
 }
 
-
 _BASE_ARGS_FORMAT = "iiiKKpOOOO"
 _BASE_ARGS_FORMAT_LEN = len(_BASE_ARGS_FORMAT)
 
 
 def make_launcher(constants, signature):
+
     def _flatten_signature(sig, output):
         # Flatten tuples
         if isinstance(sig, tuple):
@@ -353,7 +347,7 @@ def format_of(ty):
   {"; ".join([f"DevicePtrInfo ptr_info{i} = getPointer(_arg{i}, {i}); if (!ptr_info{i}.valid) return NULL;" if ty[0] == "*" else "" for i, ty in signature.items()])};
   {newline.join(float_storage_decls)}
   Py_BEGIN_ALLOW_THREADS;
-  
+
   _launch(numTilesX, numTilesY, numTilesZ, launch_pdl, (CUstream)_stream, (CUfunction)_function{', ' + ', '.join(internal_args_list) if len(internal_args_list) > 0 else ''});
   Py_END_ALLOW_THREADS;
   if (PyErr_Occurred()) {{
@@ -399,7 +393,6 @@ def format_of(ty):
     return src
 
 
-
 # This function unpacks a tensordesc object into its components:
 # - data pointer
 # - shape dimensions
@@ -418,6 +411,7 @@ def make_tensordesc_arg(arg):
 
 
 def wrap_handle_tensordesc(launcher):
+
     def inner(*args):
         # 9 is the metadata arguments in `args` defined in `make_launcher`
         meta_args = args[:9]
@@ -429,6 +423,7 @@ def inner(*args):
         else:
             final_args.append(arg)
         return launcher(*meta_args, *final_args)
+
     return inner
 
 
@@ -438,7 +433,7 @@ def __init__(self, src, metadata):
         ids = {"ids_of_const_exprs": src.fn.constexprs if hasattr(src, "fn") else tuple()}
 
         constants = src.constants if hasattr(src, "constants") else dict()
-        arg_idx = lambda x: (src.fn.arg_names.index(x),) if isinstance(x, str) else x
+        arg_idx = lambda x: (src.fn.arg_names.index(x), ) if isinstance(x, str) else x
         constants = {arg_idx(idx): value for idx, value in constants.items()}
         signature = {idx: value for idx, value in src.signature.items()}
         has_tensordesc = any("tensordesc" in value for value in signature.values())
@@ -473,7 +468,6 @@ def __init__(self, src, metadata):
         self.launch = mod.launch
         self.launch_pdl = metadata.launch_pdl
 
-
     def __call__(self, *args, **kwargs):
         # TODO: below if branch is for torch 2.8.0a0+5228986c39.nvinternal commit
         # where constexpr arguments are not passed to the launch function by inductor
@@ -482,13 +476,11 @@ def __call__(self, *args, **kwargs):
         num_launch_args = 9
         num_params = len(args) - num_launch_args
         if num_params < self.ori_signature_len:
-            extra_args = [
-                self.constants[(i,)] for i in range(num_params, self.ori_signature_len)
-            ]
+            extra_args = [self.constants[(i, )] for i in range(num_params, self.ori_signature_len)]
             model_args = args + tuple(extra_args)
         else:
             model_args = args
-        model_args = model_args[:5] + (self.launch_pdl,) + model_args[5:]
+        model_args = model_args[:5] + (self.launch_pdl, ) + model_args[5:]
 
         self.launch(*model_args, **kwargs)
@@ -543,4 +535,5 @@ def get_empty_cache_for_benchmark(self):
     def clear_cache(self, cache):
         cache.zero_()
 
+
 GlobalTileIRDriver = TileIRDriver()
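
Back in `wrap_handle_tensordesc` above, the wrapper keeps the first nine metadata arguments intact and flattens each `TensorDescriptor` kernel argument into scalar components before delegating to the real launcher (the comment block in the diff names the data pointer and shape dimensions). A self-contained sketch of that shape of wrapper; the exact flattening into pointer/shape/strides is an assumption, since only the else-branch appears in this diff:

```python
def wrap_handle_tensordesc(launcher, is_desc=lambda a: hasattr(a, "strides")):
    def inner(*args):
        meta_args = args[:9]  # launch metadata, as in make_launcher
        final_args = []
        for arg in args[9:]:
            if is_desc(arg):
                # Assumed flattening: base pointer, then shape, then strides.
                final_args.extend([arg.base, *arg.shape, *arg.strides])
            else:
                final_args.append(arg)
        return launcher(*meta_args, *final_args)
    return inner
```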
