
Commit 4278d7f

[triton] Add tl.cat(can_reorder=False) implementation (#9312)
This resurrects the old PR that replaced the implementation entirely. I also fixed `tl.cat` to be semantically equivalent to `torch.cat`.
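
For context, a minimal usage sketch of the new path, adapted from the test added in this commit (the kernel name, device, dtype, and sizes are illustrative):

import torch
import triton
import triton.language as tl

@triton.jit
def cat_kernel(X, Y, Z, N: tl.constexpr):
    offs = tl.arange(0, N)
    x = tl.load(X + offs)
    y = tl.load(Y + offs)
    # can_reorder=False preserves element order, matching torch.cat.
    z = tl.cat(x, y, can_reorder=False)
    tl.store(Z + tl.arange(0, 2 * N), z)

x = torch.arange(0, 128, device="cuda", dtype=torch.float32)
y = torch.arange(-128, 0, device="cuda", dtype=torch.float32)
z = torch.empty(256, device="cuda", dtype=torch.float32)
cat_kernel[(1, )](x, y, z, N=128)
torch.testing.assert_close(z, torch.cat([x, y]), atol=0, rtol=0)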
1 parent e9489f1 commit 4278d7f

3 files changed

Lines changed: 64 additions & 12 deletions

File tree

python/test/unit/language/test_core.py
python/triton/language/core.py
python/triton/language/semantic.py

python/test/unit/language/test_core.py

Lines changed: 39 additions & 6 deletions
@@ -14,6 +14,7 @@
 
 import triton
 import triton.language as tl
+from triton.tools.tensor_descriptor import TensorDescriptor
 
 from triton._internal_testing import (
     integral_dtypes,
@@ -1918,27 +1919,59 @@ def kernel(X, Z, TO_TYPE: tl.constexpr, BITCAST: tl.constexpr, SIZE: tl.constexp
 @pytest.mark.interpreter
 @pytest.mark.parametrize("dtype_str, num_warps",
                          [(dtype_str, num_warps) for dtype_str in int_dtypes + float_dtypes for num_warps in [4, 8]])
-def test_cat(dtype_str, num_warps, device):
+@pytest.mark.parametrize("can_reorder", [True, False])
+def test_cat(dtype_str, num_warps, can_reorder, device):
     check_type_supported(dtype_str, device)
 
     @triton.jit
-    def kernel(X, Y, Z, N: tl.constexpr):
+    def kernel(X, Y, Z, N: tl.constexpr, CAN_REORDER: tl.constexpr):
         offs = tl.arange(0, N)
         x = tl.load(X + offs)
         y = tl.load(Y + offs)
-        z = tl.cat(x, y, can_reorder=True)
+        z = tl.cat(x, y, can_reorder=CAN_REORDER)
         tl.store(Z + tl.arange(0, 2 * N), z)
 
     x = torch.arange(0, 128, device=device).to(getattr(torch, dtype_str))
     y = torch.arange(-128, 0, device=device).to(getattr(torch, dtype_str))
-    z_ref = torch.cat([x, y], dim=0).sum()
+    z_ref = torch.cat([x, y], dim=0)
     z = torch.zeros((256, ), dtype=getattr(torch, dtype_str), device=device)
-    kernel[(1, )](x, y, z, N=128, num_warps=num_warps)
-    assert z.sum() == z_ref
+    kernel[(1, )](x, y, z, N=128, num_warps=num_warps, CAN_REORDER=can_reorder)
+    assert z.sum() == z_ref.sum()
+    if not can_reorder:
+        torch.testing.assert_close(z, z_ref, atol=0, rtol=0)
     # check if there's no duplicate value in z
     assert z.unique().size(0) == z.size(0)
 
 
+CAT_ND_SHAPES = ((128, ), (16, 32), (8, 16, 4), (2, 4, 8, 16))
+CAT_ND_CASES = []
+for shape in CAT_ND_SHAPES:
+    for dim in range(len(shape)):
+        CAT_ND_CASES.append(pytest.param(shape, dim, id=f"rank={len(shape)},dim={dim}"))
+
+
+@pytest.mark.parametrize("shape, dim", CAT_ND_CASES)
+def test_cat_nd(shape, dim, device):
+
+    @triton.jit
+    def kernel(x_desc, y_desc, z_desc, dim: tl.constexpr, shape: tl.constexpr):
+        rank: tl.constexpr = len(shape)
+        x = x_desc.load([0] * rank)
+        y = y_desc.load([0] * rank)
+        z = tl.cat(x, y, dim=dim)
+        z_desc.store([0] * rank, z)
+
+    x = torch.rand(shape, device=device)
+    y = torch.rand(shape, device=device)
+    z_ref = torch.cat([x, y], dim=dim)
+    z = torch.empty_like(z_ref)
+    x_desc = TensorDescriptor.from_tensor(x, block_shape=shape)
+    y_desc = TensorDescriptor.from_tensor(y, block_shape=shape)
+    z_desc = TensorDescriptor.from_tensor(z, block_shape=z_ref.shape)
+    kernel[(1, )](x_desc, y_desc, z_desc, dim=dim, shape=shape)
+    torch.testing.assert_close(z, z_ref, atol=0, rtol=0)
+
+
 @pytest.mark.interpreter
 @pytest.mark.parametrize("dtype_str", list(torch_dtypes))
 @pytest.mark.parametrize("constant_field", ["value", "mask"])
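
The two branches of test_cat check different guarantees. A plain-PyTorch sketch of the expectation, with small illustrative sizes:

import torch

x = torch.arange(0, 4)
y = torch.arange(-4, 0)
z_ref = torch.cat([x, y], dim=0)  # tensor([ 0,  1,  2,  3, -4, -3, -2, -1])

# can_reorder=False: the kernel output must equal z_ref element for element.
# can_reorder=True: only the multiset of elements is guaranteed, so the test
# compares sums and checks that no element appears twice.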

python/triton/language/core.py

Lines changed: 23 additions & 5 deletions
@@ -1795,20 +1795,38 @@ def permute(input, *dims, _semantic=None):
 
 
 @builtin
-def cat(input, other, can_reorder=False, _semantic=None):
+def cat(input, other, can_reorder=False, dim=0, _semantic=None):
     """
     Concatenate the given blocks
 
     :param input: The first input tensor.
     :type input: Tensor
     :param other: The second input tensor.
     :type other: Tensor
-    :param reorder: Compiler hint. If true, the compiler is
+    :param can_reorder: Compiler hint. If true, the compiler is
         allowed to reorder elements while concatenating inputs. Only use if the
         order does not matter (e.g., result is only used in reduction ops).
-        Current implementation of `cat` supports only can_reorder=True.
-    """
-    return _semantic.cat(input, other, can_reorder)
+    :type can_reorder: bool
+    :param dim: The dimension to concatenate along (used when can_reorder is False).
+    :type dim: int
+    """
+    if can_reorder:
+        return _semantic.cat(input, other, can_reorder)
+
+    rank = len(input.shape)
+    assert rank == len(other.shape), f"tensors must have the same rank, got {rank} and {len(other.shape)}"
+    dim = _wrap_axis(_unwrap_if_constexpr(dim), rank)
+    assert all(input.shape[i] == other.shape[i] for i in builtins.range(rank) if i !=
+               dim), f"tensor dims must match except in the concat dimension {dim}, got {input.shape} and {other.shape}"
+
+    # Join introduces a new minor dim; move it before the concat dim and merge.
+    c = join(input, other, _semantic=_semantic)
+    order = list(builtins.range(rank))
+    order.insert(dim, rank)
+    c = permute(c, order, _semantic=_semantic)
+    new_shape = list(input.shape)
+    new_shape[dim] = input.shape[dim] + other.shape[dim]
+    return reshape(c, new_shape, _semantic=_semantic)
 
 
 @builtin
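
The comment in the new non-reorder path summarizes the construction: join stacks the two inputs along a new minor axis, permute moves that axis just before the concat dimension, and reshape merges the two into one. A standalone sketch of the same idea in plain PyTorch, with torch.stack standing in for tl.join (illustrative, not part of the commit):

import torch

def cat_via_join(x: torch.Tensor, y: torch.Tensor, dim: int) -> torch.Tensor:
    rank = x.ndim
    c = torch.stack([x, y], dim=rank)  # "join": new minor axis of size 2
    order = list(range(rank))
    order.insert(dim, rank)            # move the new axis just before `dim`
    c = c.permute(order)
    new_shape = list(x.shape)
    new_shape[dim] = 2 * x.shape[dim]  # fold the size-2 axis into `dim`
    return c.reshape(new_shape)

x, y = torch.rand(4, 8), torch.rand(4, 8)
for d in range(2):
    assert torch.equal(cat_via_join(x, y, d), torch.cat([x, y], dim=d))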

python/triton/language/semantic.py

Lines changed: 2 additions & 1 deletion
@@ -694,7 +694,8 @@ def split(self, a: TensorTy) -> Tuple[TensorTy, TensorTy]:
 
     def permute(self, input: TensorTy, dims: Tuple[int]) -> TensorTy:
         if len(input.shape) != len(dims):
-            raise ValueError("permute dims must have the same length as input shape")
+            raise ValueError(
+                f"permute dims must have the same length as input shape, got {len(input.shape)} and {len(dims)}")
         if sorted(tl._unwrap_if_constexpr(d) for d in dims) != list(range(len(dims))):
             raise ValueError(f"permute dims must be a permutation of 0, 1, ..., n-1, but were {dims}")
 
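A hypothetical repro of the improved diagnostic (the kernel fragment is illustrative, not from the commit):

# Inside a @triton.jit kernel, permuting a rank-2 block with three dims
# now reports both lengths:
#
#     x = tl.zeros((4, 8), dtype=tl.float32)
#     y = tl.permute(x, (1, 0, 2))
#
# ValueError: permute dims must have the same length as input shape, got 2 and 3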