Skip to content

Commit d530f07

Browse files
committed
[FRONTEND] Implement ragged TMA descriptor functionality in Gluon
1 parent 80c6222 commit d530f07

3 files changed

Lines changed: 364 additions & 2 deletions

File tree

Lines changed: 196 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,196 @@
1+
import torch
2+
import pytest
3+
import triton
4+
import triton.language as tl
5+
from typing import Optional
6+
7+
from triton._internal_testing import is_hopper_or_newer
8+
from triton.tools.ragged_tma import (
9+
create_ragged_descriptor,
10+
create_ragged_descriptor_device_2d,
11+
create_ragged_descriptor_device_3d,
12+
load_ragged,
13+
store_ragged,
14+
)
15+
16+
17+
@triton.jit
def example_load_store_kernel_host_desc(
    x_desc, y_desc, x_off, y_off, num_slices, ragged_dim: tl.constexpr, ndim: tl.constexpr
):
    """Copy one ragged slice between two host-built ragged TMA descriptors.

    Loads num_slices rows starting at x_off along ragged_dim from x_desc and
    stores them at y_off in y_desc; ndim (constexpr) selects 2D vs. 3D block
    coordinates.
    """
    if ndim == 2:
        tile = load_ragged(x_desc, x_off, num_slices, [0, 0], ragged_dim)
        store_ragged(y_desc, y_off, num_slices, [0, 0], tile, ragged_dim)
    else:
        tile = load_ragged(x_desc, x_off, num_slices, [0, 0, 0], ragged_dim)
        store_ragged(y_desc, y_off, num_slices, [0, 0, 0], tile, ragged_dim)
27+
28+
29+
@triton.jit
def example_load_store_kernel_device_desc_2d(
    x_ptr, y_ptr,
    x_off, y_off, num_slices,
    shape_0, shape_1,
    stride_0, stride_1,
    block_shape_0: tl.constexpr, block_shape_1: tl.constexpr,
    ragged_dim: tl.constexpr,
):
    """Round-trip a ragged 2D slice using descriptors built on the device.

    Builds one ragged TMA descriptor over x_ptr (source) and one over y_ptr
    (destination) with identical geometry, then copies num_slices rows from
    offset x_off to offset y_off along ragged_dim.
    """
    src_desc = create_ragged_descriptor_device_2d(
        x_ptr,
        shape_0, shape_1,
        stride_0, stride_1,
        block_shape_0, block_shape_1,
        ragged_dim,
    )
    dst_desc = create_ragged_descriptor_device_2d(
        y_ptr,
        shape_0, shape_1,
        stride_0, stride_1,
        block_shape_0, block_shape_1,
        ragged_dim,
    )

    tile = load_ragged(src_desc, x_off, num_slices, [0, 0], ragged_dim)
    store_ragged(dst_desc, y_off, num_slices, [0, 0], tile, ragged_dim)
55+
56+
57+
@triton.jit
def example_load_store_kernel_device_desc_3d(
    x_ptr, y_ptr,
    x_off, y_off, num_slices,
    shape_0, shape_1, shape_2,
    stride_0, stride_1, stride_2,
    block_shape_0: tl.constexpr, block_shape_1: tl.constexpr, block_shape_2: tl.constexpr,
    ragged_dim: tl.constexpr,
):
    """Round-trip a ragged 3D slice using descriptors built on the device.

    Same structure as the 2D variant: construct matching source/destination
    ragged descriptors over x_ptr/y_ptr, then copy num_slices rows from
    offset x_off to offset y_off along ragged_dim.
    """
    src_desc = create_ragged_descriptor_device_3d(
        x_ptr,
        shape_0, shape_1, shape_2,
        stride_0, stride_1, stride_2,
        block_shape_0, block_shape_1, block_shape_2,
        ragged_dim,
    )
    dst_desc = create_ragged_descriptor_device_3d(
        y_ptr,
        shape_0, shape_1, shape_2,
        stride_0, stride_1, stride_2,
        block_shape_0, block_shape_1, block_shape_2,
        ragged_dim,
    )

    tile = load_ragged(src_desc, x_off, num_slices, [0, 0, 0], ragged_dim)
    store_ragged(dst_desc, y_off, num_slices, [0, 0, 0], tile, ragged_dim)
83+
84+
85+
def _generate_test_params():
86+
dtypes = ["float16", "float32"]
87+
modes = ["host", "device"]
88+
89+
params = []
90+
for dtype in dtypes:
91+
for mode in modes:
92+
# 2D tensors: only ragged_dim=0 is valid
93+
params.append((dtype, mode, 2, 0))
94+
# 3D tensors: ragged_dim=0 and ragged_dim=1 are valid
95+
params.append((dtype, mode, 3, 0))
96+
params.append((dtype, mode, 3, 1))
97+
98+
return params
99+
100+
101+
@pytest.mark.skipif(not is_hopper_or_newer(), reason="Requires Hopper or newer")
@pytest.mark.parametrize(
    "dtype_name,descriptor_mode,ndim,ragged_dim", _generate_test_params()
)
def test_ragged_tma(dtype_name, descriptor_mode, ndim, ragged_dim):
    """Copy a ragged slice from an all-ones tensor into an all-zeros tensor
    and verify that exactly the targeted region was written.

    descriptor_mode selects whether the ragged TMA descriptors are built on
    the host (create_ragged_descriptor) or on the device inside the kernel.
    """
    torch_dtype = getattr(torch, dtype_name)

    # Fixed geometries per (ndim, ragged_dim).  Note the 2D case deliberately
    # uses a block (32x128) wider than the tensor's second dim (80), so the
    # copy is bounds-limited along the non-ragged dim as well.
    if ndim == 2:
        shape = [128, 80]
        strides = [80, 1]
        block_shape = [32, 128]
    else:  # ndim == 3
        if ragged_dim == 0:
            shape = [64, 32, 32]
            strides = [32 * 32, 32, 1]
            block_shape = [16, 16, 32]
        else:  # ragged_dim == 1
            shape = [64, 32, 32]
            strides = [32 * 32, 32, 1]
            block_shape = [32, 16, 32]

    src = torch.ones(shape, dtype=torch_dtype, device="cuda")
    dst = torch.zeros(shape, dtype=torch_dtype, device="cuda")

    # A slice strictly smaller than the ragged block dim, placed mid-tensor
    # on the destination side so both under- and overrun would be visible.
    num_slices = min(block_shape[ragged_dim] - 1, shape[ragged_dim] // 3)
    x_off = 0
    y_off = (shape[ragged_dim] - num_slices) // 2

    # Device-side tma.make_tensor_descriptor needs a scratch allocator.
    def alloc_fn(size: int, align: int, stream: Optional[int]):
        return torch.empty(size, dtype=torch.int8, device="cuda")

    triton.set_allocator(alloc_fn)

    if descriptor_mode == "host":
        x_desc = create_ragged_descriptor(src, block_shape, ragged_dim)
        y_desc = create_ragged_descriptor(dst, block_shape, ragged_dim)

        example_load_store_kernel_host_desc[(1,)](
            x_desc,
            y_desc,
            x_off,
            y_off,
            num_slices,
            ragged_dim,
            ndim,
        )
    else:
        if ndim == 2:
            example_load_store_kernel_device_desc_2d[(1,)](
                src,
                dst,
                x_off,
                y_off,
                num_slices,
                shape[0], shape[1],
                strides[0], strides[1],
                block_shape[0], block_shape[1],
                ragged_dim,
            )
        else:  # ndim == 3
            example_load_store_kernel_device_desc_3d[(1,)](
                src,
                dst,
                x_off,
                y_off,
                num_slices,
                shape[0], shape[1], shape[2],
                strides[0], strides[1], strides[2],
                block_shape[0], block_shape[1], block_shape[2],
                ragged_dim,
            )

    # Partition the written window of dst into the region before the slice,
    # the copied slice itself, and the region after it; the non-ragged dims
    # are clipped to block_shape since only one block was transferred.
    if ragged_dim == 0:
        if ndim == 2:
            before = dst[:y_off, : block_shape[1]]
            copied = dst[y_off : y_off + num_slices, : block_shape[1]]
            after = dst[y_off + num_slices :, : block_shape[1]]
        else:  # ndim == 3
            before = dst[:y_off, : block_shape[1], : block_shape[2]]
            copied = dst[y_off : y_off + num_slices, : block_shape[1], : block_shape[2]]
            after = dst[y_off + num_slices :, : block_shape[1], : block_shape[2]]
    else:  # ragged_dim == 1
        before = dst[: block_shape[0], :y_off, : block_shape[2]]
        copied = dst[: block_shape[0], y_off : y_off + num_slices, : block_shape[2]]
        after = dst[: block_shape[0], y_off + num_slices :, : block_shape[2]]

    # Only the copied region should hold ones; everything else stays zero.
    res0 = torch.all(before == 0.0).item()
    res1 = torch.all(copied == 1.0).item()
    res2 = torch.all(after == 0.0).item()

    assert [res0, res1, res2] == [
        True,
        True,
        True,
    ], f"Failed for {ndim}D {descriptor_mode} mode ragged_dim={ragged_dim}: before={res0}, copied={res1}, after={res2}"
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
from triton.experimental import gluon
2+
from triton.experimental.gluon import language as ttgl
3+
from triton.experimental.gluon.language._standard import _import_from_triton
4+
from triton.experimental.gluon.language.nvidia.hopper import tma
5+
from triton.experimental.gluon.nvidia.hopper import TensorDescriptor
6+
7+
import triton.tools.ragged_tma as tl_ragged
8+
9+
# fmt: off
10+
11+
def create_ragged_descriptor_host(T, block_shape, layout, ragged_dim=0):
    """Build a Gluon TensorDescriptor giving ragged TMA access to tensor T.

    Delegates the ragged shape/stride computation to the Triton helper, then
    rewraps the resulting descriptor fields together with the given layout.
    """
    desc = tl_ragged.create_ragged_descriptor(T, block_shape, ragged_dim)
    return TensorDescriptor(desc.base, desc.shape, desc.strides,
                            desc.block_shape, layout, padding=desc.padding)
21+
22+
23+
# Wrap the Triton-side jit helpers so they are callable from Gluon jit functions.
_compute_ragged_descriptor_params_2d = _import_from_triton(tl_ragged._compute_ragged_descriptor_params_2d)
_compute_ragged_descriptor_params_3d = _import_from_triton(tl_ragged._compute_ragged_descriptor_params_3d)
25+
26+
@gluon.jit
def create_ragged_descriptor_device_2d(
    base_ptr,
    shape_0, shape_1,
    stride_0, stride_1: ttgl.constexpr,
    block_shape_0: ttgl.constexpr, block_shape_1: ttgl.constexpr,
    layout,
    ragged_dim: ttgl.constexpr
):
    """Build, inside a Gluon kernel, the 4D TMA descriptor for a ragged 2D view.

    Mirrors triton.tools.ragged_tma.create_ragged_descriptor_device_2d, but
    goes through Gluon's tma.make_tensor_descriptor and threads the extra
    `layout` argument through to the descriptor.
    """
    # Two synthetic leading dims are prepended; see
    # _compute_ragged_descriptor_params_2d for the shape/stride encoding.
    shape, stride = _compute_ragged_descriptor_params_2d(
        shape_0, shape_1,
        stride_0, stride_1,
        ragged_dim
    )
    return tma.make_tensor_descriptor(
        base_ptr,
        shape=shape,
        # The innermost stride is passed as the constexpr stride_1 rather than
        # stride[3] (same value) — presumably make_tensor_descriptor requires
        # it compile-time known; confirm against the Gluon TMA API.
        strides=[stride[0], stride[1], stride[2], stride_1],
        # Unit blocks along the two synthetic dims; real block in the last two.
        block_shape=[1, 1, block_shape_0, block_shape_1],
        layout=layout,
    )
47+
48+
49+
@gluon.jit
def create_ragged_descriptor_device_3d(
    base_ptr,
    shape_0, shape_1, shape_2,
    stride_0, stride_1, stride_2: ttgl.constexpr,
    block_shape_0: ttgl.constexpr, block_shape_1: ttgl.constexpr, block_shape_2: ttgl.constexpr,
    layout,
    ragged_dim: ttgl.constexpr
):
    """Build, inside a Gluon kernel, the 5D TMA descriptor for a ragged 3D view.

    Mirrors triton.tools.ragged_tma.create_ragged_descriptor_device_3d, but
    goes through Gluon's tma.make_tensor_descriptor and threads the extra
    `layout` argument through to the descriptor.
    """
    # Two synthetic leading dims are prepended; see
    # _compute_ragged_descriptor_params_3d for the shape/stride encoding.
    shape, stride = _compute_ragged_descriptor_params_3d(
        shape_0, shape_1, shape_2,
        stride_0, stride_1, stride_2,
        ragged_dim
    )
    return tma.make_tensor_descriptor(
        base_ptr,
        shape=shape,
        # Innermost stride passed as the constexpr stride_2 (same value as
        # stride[4]) — presumably required compile-time known; confirm.
        strides=[stride[0], stride[1], stride[2], stride[3], stride_2],
        # Unit blocks along the two synthetic dims; real block in the last three.
        block_shape=[1, 1, block_shape_0, block_shape_1, block_shape_2],
        layout=layout,
    )
70+
71+
72+
# Gluon-callable alias of the Triton index-remapping helper (to_ragged_indices).
_to_ragged_indices = _import_from_triton(tl_ragged.to_ragged_indices)
73+
74+
75+
@gluon.jit
def to_ragged_coords(slice_off, slice_size, coords, ragged_dim: ttgl.constexpr):
    """Expand logical coords into the (ndim + 2)-dim ragged-descriptor space.

    The two synthetic leading indices (c0, c1) plus the remapped ragged index
    c2 replace coords[ragged_dim]; the remaining coords keep their order.
    See to_ragged_indices in triton.tools.ragged_tma for the (c0, c1, c2)
    encoding.
    """
    c0, c1, c2 = _to_ragged_indices(slice_off, slice_size, coords[ragged_dim])
    return [c0, c1] + coords[:ragged_dim] + [c2] + coords[ragged_dim + 1:]

python/triton/tools/ragged_tma.py

Lines changed: 90 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,104 @@ def create_ragged_descriptor(T, block_shape, ragged_dim=0):
4545
return TensorDescriptor(T, tma_shape, tma_stride, box_shape)
4646

4747

48+
@triton.jit
def _compute_ragged_descriptor_params_2d(
    shape_0, shape_1,
    stride_0, stride_1: tl.constexpr,
    ragged_dim: tl.constexpr
):
    """Compute (shape, strides) of the 4D TMA descriptor that emulates a
    ragged view of a 2D tensor.

    Two synthetic leading dimensions are prepended to the tensor's own dims.
    With the coordinates produced by to_ragged_indices
    (c0 = 2**30, c1 = slice_off + slice_size, c2 = 2**30 - slice_size + row),
    the address contribution is
        c0*(2**34 - stride_0) + c1*stride_0 + c2*stride_0
      = (slice_off + row)*stride_0 + 2**30 * 2**34,
    and 2**30 * 2**34 == 2**64 wraps to zero in 64-bit arithmetic, leaving
    the plain row address.  Rows with row >= slice_size push c2 to or past
    the 2**30 extent of its dimension — NOTE(review): assumed that the TMA
    unit then masks those rows as out-of-bounds; confirm.
    """
    tl.static_assert(
        ragged_dim < 1,
        "Using last dim as ragged dim is not supported"
    )

    max_int: tl.constexpr = 0x7fff0000  # oversized extent for the synthetic dims
    billion: tl.constexpr = 0x40000000  # == 2**30; must match to_ragged_indices
    two_to_34 = tl.to_tensor(2**34)
    return (
        [max_int, max_int, billion, shape_1],
        [two_to_34 - stride_0, stride_0, stride_0, stride_1],
    )
66+
67+
68+
@triton.jit
def _compute_ragged_descriptor_params_3d(
    shape_0, shape_1, shape_2,
    stride_0, stride_1, stride_2: tl.constexpr,
    ragged_dim: tl.constexpr
):
    """Compute (shape, strides) of the 5D TMA descriptor that emulates a
    ragged view of a 3D tensor.

    Same 2**34/2**30 wraparound encoding as the 2D variant (see
    _compute_ragged_descriptor_params_2d), except ragged_dim selects which
    tensor stride participates in the trick: the billion-sized dimension and
    the duplicated stride move to the ragged axis, while the non-ragged dims
    keep their true extents.  The last dimension may never be ragged.
    """
    tl.static_assert(
        ragged_dim < 2,
        "Using last dim as ragged dim is not supported"
    )

    max_int: tl.constexpr = 0x7fff0000  # oversized extent for the synthetic dims
    billion: tl.constexpr = 0x40000000  # == 2**30; must match to_ragged_indices
    two_to_34 = tl.to_tensor(2**34)
    if ragged_dim == 0:
        return (
            [max_int, max_int, billion, shape_1, shape_2],
            [two_to_34 - stride_0, stride_0, stride_0, stride_1, stride_2],
        )
    else:
        # Ragged along dim 1: dim 0 precedes the billion-sized ragged dim
        # (matching the coordinate order built by to_ragged_coords).
        return (
            [max_int, max_int, shape_0, billion, shape_2],
            [two_to_34 - stride_1, stride_1, stride_0, stride_1, stride_2],
        )
92+
93+
94+
@triton.jit
def create_ragged_descriptor_device_2d(
    base_ptr,
    shape_0, shape_1,
    stride_0, stride_1: tl.constexpr,
    block_shape_0: tl.constexpr, block_shape_1: tl.constexpr,
    ragged_dim: tl.constexpr
):
    """Create, on-device, the 4D TMA descriptor for a ragged view of a 2D
    tensor at base_ptr.

    See _compute_ragged_descriptor_params_2d for the shape/stride encoding.
    """
    shape, stride = _compute_ragged_descriptor_params_2d(
        shape_0, shape_1,
        stride_0, stride_1,
        ragged_dim
    )
    # Constexpr 1 so the unit block dims are compile-time known.
    one: tl.constexpr = 1
    return tl.make_tensor_descriptor(
        base_ptr,
        shape=shape,
        # Innermost stride passed as the constexpr stride_1 (same value as
        # stride[3]) — presumably required compile-time known; confirm.
        strides=[stride[0], stride[1], stride[2], stride_1],
        # Unit blocks along the two synthetic dims; real block in the last two.
        block_shape=[one, one, block_shape_0, block_shape_1],
    )
114+
115+
116+
@triton.jit
def create_ragged_descriptor_device_3d(
    base_ptr,
    shape_0, shape_1, shape_2,
    stride_0, stride_1, stride_2: tl.constexpr,
    block_shape_0: tl.constexpr, block_shape_1: tl.constexpr, block_shape_2: tl.constexpr,
    ragged_dim: tl.constexpr
):
    """Create, on-device, the 5D TMA descriptor for a ragged view of a 3D
    tensor at base_ptr.

    See _compute_ragged_descriptor_params_3d for the shape/stride encoding.
    """
    shape, stride = _compute_ragged_descriptor_params_3d(
        shape_0, shape_1, shape_2,
        stride_0, stride_1, stride_2,
        ragged_dim
    )
    # Constexpr 1 so the unit block dims are compile-time known.
    one: tl.constexpr = 1
    return tl.make_tensor_descriptor(
        base_ptr,
        shape=shape,
        # Innermost stride passed as the constexpr stride_2 (same value as
        # stride[4]) — presumably required compile-time known; confirm.
        strides=[stride[0], stride[1], stride[2], stride[3], stride_2],
        # Unit blocks along the two synthetic dims; real block in the last three.
        block_shape=[one, one, block_shape_0, block_shape_1, block_shape_2],
    )
136+
137+
48138
@triton.jit
def to_ragged_indices(slice_off, slice_size, row):
    """
    Helper function for load_ragged and store_ragged.

    Maps `row` within a ragged slice [slice_off, slice_off + slice_size) to
    the three coordinates (c0, c1, c2) of the padded ragged-TMA descriptor:
        c0 = 2**30 (fixed), c1 = slice_off + slice_size,
        c2 = 2**30 - slice_size + row.
    Combined with the descriptor strides from
    _compute_ragged_descriptor_params_*, these cancel to the plain address
    (slice_off + row) * stride via 64-bit wraparound, while rows at or past
    the slice end land outside the 2**30 extent of c2's dimension.
    """
    billion = 0x40000000  # == 2**30
    x = billion - slice_size + row
    y = slice_off + slice_size
    return billion, y, x
59147

60148

0 commit comments

Comments
 (0)