1212# See the License for the specific language governing permissions and
1313# limitations under the License.
1414
15- """Module for calling Triton kernels from JAX."""
15+ """Module for calling Triton or Triton.Gluon kernels from JAX."""
1616
1717from __future__ import annotations
1818
4646CAN_USE_TRITON = False
4747try :
4848 import triton
49- from triton .compiler import code_generator as code_gen
5049 from triton .compiler import compiler as tc
5150 import triton .language as tl
5251 from triton .runtime import autotuner
5352 import triton ._C .libtriton as _triton
5453 import triton .backends .nvidia .compiler as cb
5554
55+ import triton .experimental .gluon ._runtime as gl_runtime
56+ from triton .experimental .gluon import language as gl
57+
5658 CAN_USE_TRITON = True
5759except ModuleNotFoundError :
5860 pass
@@ -115,7 +117,7 @@ def avals_to_layouts(avals):
115117def get_triton_type (obj : Any ) -> str :
116118 if isinstance (obj , (jax .core .ShapedArray , state .AbstractRef )):
117119 return f"*{ _JAX_TO_TRITON_TYPE_MAP [obj .dtype ]} "
118- if isinstance (obj , tl .constexpr ):
120+ if isinstance (obj , ( tl .constexpr , gl . constexpr ) ):
119121 obj = obj .value
120122 if isinstance (obj , bool ): # True == isinstance(True, int) !!!
121123 return "B"
@@ -160,10 +162,8 @@ def aval_size_bytes(aval):
160162 return np .dtype (aval .dtype ).itemsize * aval .size
161163
162164
163- def get_cuda_backend (device , compute_capability ):
164- target = cb .GPUTarget ("cuda" , compute_capability , 32 )
165- backend = cb .CUDABackend (target )
166- return backend
165+ def make_gpu_target_cuda (device , compute_capability ):
166+ return cb .GPUTarget ("cuda" , compute_capability , 32 )
167167
168168
169169_IS_HIPBackend_PATCHED = False
@@ -199,15 +199,13 @@ def fixed_is_within_2gb(arg):
199199 hb .HIPBackend .is_within_2gb = fixed_is_within_2gb
200200
201201
202- def get_hip_backend (device , compute_capability ):
202+ def make_gpu_target_hip (device , compute_capability ):
203203 # TODO(Arech): remove _patch_hip_backend() once Triton releases a fix
204204 _patch_hip_backend ()
205205
206206 arch = triton_kernel_call_lib .get_arch_details (device )
207207 arch = arch .split (":" )[0 ]
208- target = hb .GPUTarget ("hip" , arch , 64 )
209- backend = hb .HIPBackend (target )
210- return backend
208+ return hb .GPUTarget ("hip" , arch , 64 )
211209
212210
213211@dataclasses .dataclass
@@ -358,7 +356,7 @@ def compile_ttir_to_hsaco_inplace(
358356
359357
360358def get_or_create_triton_kernel (
361- backend_init_func ,
359+ make_gpu_target_func ,
362360 platform ,
363361 fn ,
364362 arg_dtypes ,
@@ -385,7 +383,8 @@ def get_or_create_triton_kernel(
385383 if num_ctas > 1 and compute_capability < 90 :
386384 raise ValueError ("num_ctas > 1 unsupported before Hopper." )
387385
388- backend = backend_init_func (device , compute_capability )
386+ gpu_target = make_gpu_target_func (device , compute_capability )
387+ backend = triton .compiler .make_backend (gpu_target )
389388
390389 signature = {fn .arg_names [i ]: v for i , v in enumerate (arg_dtypes )}
391390 # TODO(sharadmv,zhangqiaorjc): handle differently aligned pointers
@@ -470,16 +469,15 @@ def get_or_create_triton_kernel(
470469 backend .load_dialects (context )
471470 codegen_fns = backend .get_codegen_implementation (options )
472471
473- module = code_gen .ast_to_ttir (
474- fn ,
475- tc .ASTSource (
476- fn , constexprs = constants , signature = signature , attrs = attrs
477- ),
478- options = options ,
479- codegen_fns = codegen_fns ,
480- context = context ,
481- module_map = backend .get_module_map (),
472+ real_ASTSource = (
473+ gl_runtime .GluonASTSource
474+ if isinstance (fn , gl_runtime .GluonJITFunction )
475+ else tc .ASTSource
482476 )
477+ module = real_ASTSource (
478+ fn , constexprs = constants , signature = signature , attrs = attrs
479+ ).make_ir (gpu_target , options , codegen_fns , backend .get_module_map (), context )
480+
483481 ttir = str (module )
484482
485483 compilation_result = compile_ttir_inplace (
@@ -529,7 +527,7 @@ def get_or_create_triton_kernel(
529527
530528
531529def triton_kernel_call_lowering (
532- backend_init_func ,
530+ make_gpu_target_func ,
533531 ctx ,
534532 * array_args ,
535533 fn ,
@@ -547,16 +545,21 @@ def triton_kernel_call_lowering(
547545 zeroed_outputs ,
548546 debug ,
549547 serialized_metadata ,
550- ** metaparams ,
548+ metaparams : tuple [ tuple [ str , Any ], ...] ,
551549):
550+ # We have to pass the metaparams dictionary as a tuple because lowering via
551+ # xla_primitive_callable() requires the parameters to be hashable.
552+ assert isinstance (metaparams , tuple ), "metaparams must be tuple[tuple[str, Any], ...]"
553+ metaparams = dict (metaparams ) # will crash if the tuple format is incompatible
554+
552555 kernel_call_name = name
553556 args = list (ctx .avals_in )
554557 arg_dtypes = list (map (get_triton_type , ctx .avals_in ))
555558 for idx , dtype , v in scalar_args :
556559 args .insert (idx , v )
557560 arg_dtypes .insert (idx , dtype )
558561 # Extract only the output avals not referenced in the input_output_aliases mapping.
559- assert isinstance (input_output_aliases , tuple )
562+ assert isinstance (input_output_aliases , tuple ), "input_output_aliases must be a tuple"
560563 input_output_aliases = dict (input_output_aliases )
561564 strictly_out_avals = [
562565 aval
@@ -622,9 +625,9 @@ def prune_configs(configs, named_args, **kwargs):
622625 configs = updated_configs
623626 fn = fn .fn
624627
625- if not isinstance (fn , triton .JITFunction ):
628+ if not isinstance (fn , ( triton .JITFunction , gl_runtime . GluonJITFunction ) ):
626629 raise ValueError (
627- "`kernel` must be a Triton `JITFunction`, `Heuristics` or `Autotuner`."
630+ "`kernel` must be a Triton `JITFunction`, `GluonJITFunction`, `Heuristics` or `Autotuner`."
628631 )
629632
630633 output2input = {v : k for k , v in input_output_aliases .items ()}
@@ -664,7 +667,7 @@ def prune_configs(configs, named_args, **kwargs):
664667 kernel_calls = []
665668 for params in config_params :
666669 kernel , specialization_attr = get_or_create_triton_kernel (
667- backend_init_func ,
670+ make_gpu_target_func ,
668671 ctx .module_context .platforms [0 ],
669672 fn ,
670673 arg_dtypes ,
@@ -739,13 +742,13 @@ def prune_configs(configs, named_args, **kwargs):
739742
740743mlir .register_lowering (
741744 triton_kernel_call_p ,
742- functools .partial (triton_kernel_call_lowering , get_cuda_backend ),
745+ functools .partial (triton_kernel_call_lowering , make_gpu_target_cuda ),
743746 platform = "cuda" ,
744747)
745748
746749mlir .register_lowering (
747750 triton_kernel_call_p ,
748- functools .partial (triton_kernel_call_lowering , get_hip_backend ),
751+ functools .partial (triton_kernel_call_lowering , make_gpu_target_hip ),
749752 platform = "rocm" ,
750753)
751754
@@ -791,6 +794,7 @@ def triton_call(
791794 * args : jax .Array | bool | int | float | np .float32 ,
792795 kernel : (
793796 triton .JITFunction
797+ | gl_runtime .GluonJITFunction
794798 | triton .runtime .Heuristics
795799 | triton .runtime .Autotuner
796800 ),
@@ -865,7 +869,8 @@ def add(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
865869 Args:
866870 *args: Inputs for the Triton kernel.
867871 kernel: A Triton kernel (e.g. a function decorated with `triton.jit`). All
868- static values should be annotated with `triton.language.constexpr`.
872+ static values should be annotated with `triton.language.constexpr` or
873+ `triton.experimental.gluon.language.constexpr`.
869874 out_shape: A `jax.ShapeDtypeStruct` (or something that has `.shape` and
870875 `.dtype` attributes) or a sequence thereof that specify the output(s) of
871876 the kernel. Pointers for each of the `jax.ShapeDtypeStruct`s in
@@ -880,14 +885,17 @@ def add(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
880885 indices. Providing a mapping will alias the corresponding buffers.
881886 zeroed_outputs: A sequence of indices, or a function returning a sequence of
882887 indices, for outputs that should be zeroed before the kernel is launched.
888+ Note that indices of input-output arguments (i.e. those aliased through
889+ `input_output_aliases`) may also be listed here; such aliased buffers are
890+ zeroed as outputs.
883891 num_warps: The number of warps used to execute the Triton kernel.
884892 num_stages: The number of stages emitted by the Triton compiler.
885893 num_ctas: The size of thread blocks per cluster to be used on GPUs with
886894 compute capabilities >= 9.0. It must be less or equal to 8.
887895 debug: Prints out intermediate IRs if True for debugging purposes.
888896 serialized_metadata: Arbitrary metadata that will be added into the
889897 serialized kernel call.
890- ** metaparams: Additional keyword arguments that will be provided to a `grid`
898+ metaparams: A dictionary of arguments that will be provided to a `grid`
891899 (if it is a function) and to the Triton kernel as `constexpr` arguments.
892900
893901 Returns:
@@ -934,6 +942,6 @@ def add(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
934942 zeroed_outputs = zeroed_outputs ,
935943 debug = debug ,
936944 serialized_metadata = serialized_metadata ,
937- ** metaparams ,
945+ metaparams = tuple ( metaparams . items ()) ,
938946 )
939947 return tree_util .tree_unflatten (out_tree , out_flat )
0 commit comments