Skip to content

Commit 278e956

Browse files
authored
[AMD][GLUON] host TDM descriptor support for 1D-5D on gfx1250 (triton-lang#8977)
Currently host TDM descriptors only support 2D tiles. This PR adds 1D-5D support to host descriptors, bringing them to parity with on-device descriptor creation. It also moves some code shared between the driver and the compiler into a header, `TDMCommon.h`, for the warp/block distribution calculations. SGPR preload is also disabled on gfx1250.
1 parent 5a8358c commit 278e956

File tree

7 files changed

+193
-60
lines changed

7 files changed

+193
-60
lines changed

python/triton/experimental/gluon/amd/gfx1250.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ class TensorDescriptor:
1717

1818
def __post_init__(self):
1919
ndim = len(self.shape)
20-
# TODO: support 1D-5D tensor descriptors
21-
assert ndim == 2, f"Expected 2 dimensions but got {ndim} dimensions"
20+
assert 1 <= ndim <= 5, f"Expected 1-5 dimensions but got {ndim} dimensions"
2221
assert len(self.strides) == ndim, f"Expected {ndim} strides but got {len(self.strides)}"
2322
assert len(self.block_shape) == ndim, \
2423
f"Expected block_shape to have {ndim} dimensions but got {len(self.strides)}"

third_party/amd/backend/compiler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,8 @@ def make_llir(src, metadata, options):
394394
# Hint the compiler that we'd like the firmware to set the kernel arguments
395395
# to user SGPRs so that the kernel does not need to s_load its arguments
396396
# from memory.
397-
amd.set_all_fn_arg_inreg(fns[0])
397+
if options.arch != "gfx1250":
398+
amd.set_all_fn_arg_inreg(fns[0])
398399

399400
if knobs.compilation.enable_asan:
400401
default_libdir = Path(__file__).parent / 'lib'

third_party/amd/backend/driver.c

Lines changed: 90 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
#include <stdio.h>
99
#include <stdlib.h>
1010

11+
// Include shared TDM utilities
12+
#include "TDMCommon.h"
13+
1114
typedef struct {
1215
uint32_t group0_0;
1316
uint32_t group0_1;
@@ -21,6 +24,14 @@ typedef struct {
2124
uint32_t group1_5;
2225
uint32_t group1_6;
2326
uint32_t group1_7;
27+
uint32_t group2_0;
28+
uint32_t group2_1;
29+
uint32_t group2_2;
30+
uint32_t group2_3;
31+
uint32_t group3_0;
32+
uint32_t group3_1;
33+
uint32_t group3_2;
34+
uint32_t group3_3;
2435
} TDMDescriptor;
2536

2637
typedef struct {
@@ -54,36 +65,39 @@ static PyTypeObject PyTDMDescriptorType = {
5465
.tp_dealloc = (destructor)PyTDMDescriptor_dealloc,
5566
};
5667

57-
// TODO: Both host-side and device-side TDM descriptor follow the same encoding
58-
// format. Consider to add a common utility to remove duplicate code.
68+
// Encodes a TDM descriptor. Supports 1D-5D tensors.
69+
// Uses the same encoding format as createTDMDescriptor in TDMUtility.cpp.
5970
static bool encodeTDMDescriptor(TDMDescriptor *desc, int elementBitWidth,
6071
uint32_t *blockSize, int numWarps,
6172
int padInterval, int padAmount, uint32_t *shape,
6273
uint32_t *strides, uint64_t globalAddress,
6374
int rank) {
64-
// NYI: TDM > 2D cases
65-
if (rank != 2)
75+
if (rank < 1 || rank > 5)
6676
return false;
6777

68-
// Get warp distribution
69-
uint32_t numWarpsDim0 = numWarps;
70-
for (; numWarpsDim0 > blockSize[0]; numWarpsDim0 /= 2)
71-
;
72-
uint32_t numWarpsDim1 = numWarps / numWarpsDim0;
73-
if (!(numWarpsDim0 > 0 && blockSize[1] % numWarpsDim1 == 0))
74-
return false;
78+
memset(desc, 0, sizeof(TDMDescriptor));
7579

76-
uint32_t blockSize0 = (blockSize[0] + numWarpsDim0 - 1) / numWarpsDim0;
77-
uint32_t blockSize1 = (blockSize[1] + numWarpsDim1 - 1) / numWarpsDim1;
80+
// Convert to int64_t for shared function and get adjusted block sizes
81+
int64_t blockShape64[5], adjustedBlockSize64[5];
82+
for (int i = 0; i < rank; ++i)
83+
blockShape64[i] = blockSize[i];
84+
tdmGetAdjustedBlockShape(blockShape64, rank, numWarps, adjustedBlockSize64);
85+
86+
// Convert back to uint32_t
87+
uint32_t adjustedBlockSize[5];
88+
for (int i = 0; i < rank; ++i)
89+
adjustedBlockSize[i] = (uint32_t)adjustedBlockSize64[i];
7890

7991
// group0 (128 bits / 4 dwords) effective bit encoding:
92+
// [1:0]: pred (to be filled later)
93+
// [63:32]: lds address (to be filled later)
8094
// [120:64]: global address
8195
// [127:126]: type - currently always set to 0x2
8296
desc->group0_2 = (uint32_t)(globalAddress & 0xFFFFFFFF);
83-
desc->group0_3 = (uint32_t)((globalAddress >> 32) & 0x01FFFFFF);
84-
desc->group0_3 |= (0x1 << 31);
97+
desc->group0_3 = (uint32_t)((globalAddress >> 32) & 0x7FFFFFFF) | (0x1 << 31);
8598

8699
// group1 (256 bits / 8 dwords) effective bit encoding:
100+
// [15:0]: multicast mask
87101
// [17:16]: data size - log2(element size in bytes)
88102
// [20]: enable padding
89103
// [24:22]: pad interval - log2(pad interval in dwords) - 1
@@ -92,26 +106,72 @@ static bool encodeTDMDescriptor(TDMDescriptor *desc, int elementBitWidth,
92106
// [111:80]: tensor shape dim outer
93107
// [127:112]: block shape dim inner
94108
// [143:128]: block shape dim outer
109+
// [159:144]: tile_dim2
95110
// [207:160]: tensor stride dim outer (we only use 32 bits)
111+
// [255:208]: tensor stride dim 2 (48 bits)
96112
int elementSizeInBytes = elementBitWidth / 8;
97-
int dataSize = log2(elementSizeInBytes);
98-
desc->group1_0 = (dataSize << 16);
113+
int dataSize = (int)log2(elementSizeInBytes);
99114
int dwordSize = 32;
100115
int padIntervalInDwords = padInterval * elementBitWidth / dwordSize;
101116
int padAmountInDwords = padAmount * elementBitWidth / dwordSize;
117+
118+
desc->group1_0 = (dataSize << 16);
102119
if (padIntervalInDwords > 0 && padAmountInDwords > 0) {
103-
int log2PadInterval = log2(padIntervalInDwords);
120+
int log2PadInterval = (int)log2(padIntervalInDwords);
104121
desc->group1_0 |= (1 << 20);
105122
desc->group1_0 |= ((log2PadInterval - 1) << 22);
106123
desc->group1_0 |= ((padAmountInDwords - 1) << 25);
107124
}
108-
desc->group1_1 = (shape[1] << 16);
109-
desc->group1_2 = (shape[1] >> 16);
110-
desc->group1_2 |= (shape[0] << 16);
111-
desc->group1_3 = (shape[0] >> 16);
112-
desc->group1_3 |= (blockSize1 << 16);
113-
desc->group1_4 = (blockSize0 & 0xFFFF);
114-
desc->group1_5 = strides[0];
125+
126+
// Encode tensor shapes (48-bit encoding, indices from end: rank-1 is inner)
127+
desc->group1_1 = (shape[rank - 1] << 16);
128+
desc->group1_2 = (shape[rank - 1] >> 16);
129+
130+
if (rank >= 2) {
131+
desc->group1_2 |= (shape[rank - 2] << 16);
132+
desc->group1_3 = (shape[rank - 2] >> 16);
133+
}
134+
135+
// Block shapes
136+
desc->group1_3 |= (adjustedBlockSize[rank - 1] << 16);
137+
if (rank >= 2)
138+
desc->group1_4 = (adjustedBlockSize[rank - 2] & 0xFFFF);
139+
if (rank >= 3)
140+
desc->group1_4 |= (adjustedBlockSize[rank - 3] << 16);
141+
142+
// Strides
143+
if (rank >= 2)
144+
desc->group1_5 = strides[rank - 2];
145+
if (rank >= 3) {
146+
desc->group1_6 = (strides[rank - 3] << 16);
147+
desc->group1_7 = (strides[rank - 3] >> 16);
148+
}
149+
150+
// group2 (128 bits / 4 dwords) for 3D-5D tensors:
151+
// [31:0]: tensor_dim2 (3rd dimension from end)
152+
// [63:32]: tensor_dim3 (4th dimension from end)
153+
// [111:64]: tensor_dim2_stride (48 bits, we use 32 bits)
154+
// [127:112]: tile_dim3
155+
if (rank >= 3) {
156+
desc->group2_0 = shape[rank - 3];
157+
if (rank >= 4) {
158+
desc->group2_1 = shape[rank - 4];
159+
desc->group2_2 = strides[rank - 4];
160+
desc->group2_3 = (adjustedBlockSize[rank - 4] << 16);
161+
}
162+
}
163+
164+
// group3 (128 bits / 4 dwords) for 4D-5D tensors:
165+
// [47:0]: tensor_dim3_stride (48 bits, we use 32 bits)
166+
// [79:48]: tensor_dim4 (5th dimension from end)
167+
// [95:80]: tile_dim4
168+
// [127:96]: reserved
169+
if (rank == 5) {
170+
desc->group3_0 = strides[rank - 5];
171+
desc->group3_1 = (shape[rank - 5] << 16);
172+
desc->group3_2 = (shape[rank - 5] >> 16);
173+
desc->group3_2 |= (adjustedBlockSize[rank - 5] << 16);
174+
}
115175

116176
return true;
117177
}
@@ -388,16 +448,16 @@ static PyObject *createTDMDescriptor(PyObject *self, PyObject *args) {
388448
PyObject *shapeFast = NULL;
389449
PyObject *stridesFast = NULL;
390450

391-
uint32_t blockSizeInt[2];
392-
uint32_t shapeInt[2];
393-
uint32_t stridesInt[2];
451+
uint32_t blockSizeInt[5];
452+
uint32_t shapeInt[5];
453+
uint32_t stridesInt[5];
394454

395455
blockSizeFast = PySequence_Fast(blockSize, "blockSize must be a sequence");
396456
if (!blockSizeFast)
397457
goto cleanup;
398458
int rank = PySequence_Fast_GET_SIZE(blockSizeFast);
399-
if (rank != 2) {
400-
PyErr_SetString(PyExc_RuntimeError, "rank must be 2");
459+
if (rank == 0 || rank > 5) {
460+
PyErr_SetString(PyExc_RuntimeError, "rank must be between 1 and 5");
401461
goto cleanup;
402462
}
403463

third_party/amd/backend/driver.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,14 @@ def format_of(ty):
368368
uint32_t group1_5;
369369
uint32_t group1_6;
370370
uint32_t group1_7;
371+
uint32_t group2_0;
372+
uint32_t group2_1;
373+
uint32_t group2_2;
374+
uint32_t group2_3;
375+
uint32_t group3_0;
376+
uint32_t group3_1;
377+
uint32_t group3_2;
378+
uint32_t group3_3;
371379
}} TDMDescriptor;
372380
373381
typedef struct {{
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#ifndef TRITON_THIRD_PARTY_AMD_BACKEND_INCLUDE_TDMCOMMON_H
2+
#define TRITON_THIRD_PARTY_AMD_BACKEND_INCLUDE_TDMCOMMON_H
3+
4+
//===----------------------------------------------------------------------===//
5+
// C-compatible TDM utilities shared between host-side (driver.c) and
6+
// device-side (TDMUtility.cpp) code.
7+
//
8+
// This is intentionally kept header-only to avoid introducing
9+
// dependencies between the compiler and runtime components.
10+
//===----------------------------------------------------------------------===//
11+
12+
#include <stdint.h>
13+
14+
// Compute warp distribution across dimensions.
15+
// Distributes warps starting from the first dimension, assigning as many
16+
// warps as possible without exceeding the block shape.
17+
static inline void tdmGetWarpDistribution(const int64_t *blockShape,
18+
int numDims, int numWarps,
19+
int *warpsOut) {
20+
for (int i = 0; i < numDims; ++i)
21+
warpsOut[i] = 1;
22+
23+
int remainingWarps = numWarps;
24+
for (int i = 0; i < numDims && remainingWarps > 1; ++i) {
25+
while (remainingWarps > 1 && warpsOut[i] * 2 <= blockShape[i]) {
26+
warpsOut[i] *= 2;
27+
remainingWarps /= 2;
28+
}
29+
}
30+
31+
if (remainingWarps > 1)
32+
warpsOut[numDims - 1] *= remainingWarps;
33+
}
34+
35+
// Compute per-warp block sizes after distributing warps.
36+
// Only adjusts first 2 dimensions; higher dimensions remain unchanged.
37+
static inline void tdmGetAdjustedBlockShape(const int64_t *blockShape,
38+
int numDims, int numWarps,
39+
int64_t *adjustedOut) {
40+
int warps[5];
41+
tdmGetWarpDistribution(blockShape, numDims, numWarps, warps);
42+
43+
if (numDims >= 2) {
44+
adjustedOut[0] = (blockShape[0] + warps[0] - 1) / warps[0];
45+
adjustedOut[1] = (blockShape[1] + warps[1] - 1) / warps[1];
46+
} else {
47+
adjustedOut[0] = (blockShape[0] + numWarps - 1) / numWarps;
48+
}
49+
50+
// Higher dimensions are not divided by warps
51+
for (int i = 2; i < numDims; ++i)
52+
adjustedOut[i] = blockShape[i];
53+
}
54+
55+
#endif // TRITON_THIRD_PARTY_AMD_BACKEND_INCLUDE_TDMCOMMON_H

third_party/amd/lib/TritonAMDGPUToLLVM/TDMUtility.cpp

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
#include "triton/Tools/LayoutUtils.h"
44
#include <optional>
55

6+
// Include shared C-compatible TDM utilities
7+
#include "../../backend/include/TDMCommon.h"
8+
69
namespace mlir::LLVM::AMD {
710
namespace {
811

@@ -54,30 +57,16 @@ decodeTDMDescriptor(RewriterBase &rewriter, Location loc,
5457
return {srcPtr, tensorShape, tensorStride};
5558
}
5659

60+
// C++ wrapper for the shared tdmGetWarpDistribution function
5761
SmallVector<int> getWarpDistribution(ArrayRef<int64_t> blockShape,
5862
int numWarps) {
59-
SmallVector<int> warps(blockShape.size(), 1);
60-
int remainingWarps = numWarps;
61-
62-
// Distribute warps across dimensions, starting from the first dimension
63-
for (size_t i = 0; i < blockShape.size() && remainingWarps > 1; ++i) {
64-
// Try to assign as many warps as possible to this dimension
65-
// without exceeding the block shape
66-
while (remainingWarps > 1 && warps[i] * 2 <= blockShape[i]) {
67-
warps[i] *= 2;
68-
remainingWarps /= 2;
69-
}
70-
}
71-
72-
// If there are still remaining warps, assign them to the last dimension
73-
// This ensures we use all available warps
74-
if (remainingWarps > 1) {
75-
warps[blockShape.size() - 1] *= remainingWarps;
76-
}
63+
int numDims = blockShape.size();
64+
SmallVector<int> warps(numDims);
65+
tdmGetWarpDistribution(blockShape.data(), numDims, numWarps, warps.data());
7766

7867
// Verify the distribution is valid
7968
int totalWarps = 1;
80-
for (size_t i = 0; i < warps.size(); ++i) {
69+
for (int i = 0; i < numDims; ++i) {
8170
totalWarps *= warps[i];
8271
assert(blockShape[i] % warps[i] == 0 &&
8372
"Block shape must be divisible by warp distribution");

third_party/amd/python/test/test_gluon_gfx1250.py

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,8 +1220,8 @@ def test_runtime_tensor_fill(M, N, BLOCK_M, BLOCK_N, NUM_BUFFERS):
12201220

12211221

12221222
@gluon.jit
1223-
def tensor_descriptor_load_store_nd_kernel(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE, out_shape, out_strides,
1224-
SHARED_LAYOUT: ttgl.constexpr):
1223+
def tensor_descriptor_load_store_nd_kernel_device_tdm(out_ptr, a_ptr, shape, strides, BLOCK_SHAPE, out_shape,
1224+
out_strides, SHARED_LAYOUT: ttgl.constexpr):
12251225
ndim: ttgl.constexpr = len(BLOCK_SHAPE)
12261226
desc = ttgl.amd.gfx1250.tdm.make_tensor_descriptor(base=a_ptr, shape=shape, strides=strides,
12271227
block_shape=BLOCK_SHAPE, layout=SHARED_LAYOUT)
@@ -1238,10 +1238,23 @@ def tensor_descriptor_load_store_nd_kernel(out_ptr, a_ptr, shape, strides, BLOCK
12381238
ttgl.amd.gfx1250.tdm.async_wait(0)
12391239

12401240

1241+
@gluon.jit
1242+
def tensor_descriptor_load_store_nd_kernel_host_tdm(out_desc, inp_desc):
1243+
ndim: ttgl.constexpr = len(inp_desc.block_shape)
1244+
offs = (0, ) * ndim
1245+
block_shared = ttgl.allocate_shared_memory(inp_desc.dtype, shape=inp_desc.block_shape, layout=inp_desc.layout)
1246+
ttgl.amd.gfx1250.tdm.async_load(inp_desc, offs, block_shared)
1247+
ttgl.amd.gfx1250.tdm.async_wait(0)
1248+
1249+
ttgl.amd.gfx1250.tdm.async_store(out_desc, offs, block_shared)
1250+
ttgl.amd.gfx1250.tdm.async_wait(0)
1251+
1252+
12411253
@pytest.mark.parametrize("ndim", [1, 2, 3, 4, 5])
12421254
@pytest.mark.parametrize("INNER_BLOCK", [4, 8, 16, 32, 64, 128])
12431255
@pytest.mark.parametrize("dtype_str", sorted(set(dtypes_with_bfloat16) - {"int64", "uint64", "float64"}))
1244-
def test_tensor_descriptor_load_store_nd(dtype_str, ndim, INNER_BLOCK):
1256+
@pytest.mark.parametrize("TDM_TYPE", ["DEVICE_TDM", "HOST_TDM"])
1257+
def test_tensor_descriptor_load_store_nd(dtype_str, ndim, INNER_BLOCK, TDM_TYPE):
12451258
SHARED_LAYOUT: ttgl.constexpr = ttgl.SwizzledSharedLayout(vec=1, per_phase=1, max_phase=1,
12461259
order=[ndim - 1 - i for i in range(ndim)])
12471260

@@ -1263,9 +1276,16 @@ def test_tensor_descriptor_load_store_nd(dtype_str, ndim, INNER_BLOCK):
12631276
inp = inp.cuda()
12641277
out = out.cuda()
12651278

1266-
constexpr_block_shape = tuple(ttgl.constexpr(v) for v in BLOCK_SHAPE)
1267-
k = tensor_descriptor_load_store_nd_kernel[(1, )](out, inp, inp.shape, inp.stride(), constexpr_block_shape,
1268-
out.shape, out.stride(), SHARED_LAYOUT)
1279+
if TDM_TYPE == "DEVICE_TDM":
1280+
constexpr_block_shape = tuple(ttgl.constexpr(v) for v in BLOCK_SHAPE)
1281+
k = tensor_descriptor_load_store_nd_kernel_device_tdm[(1, )](out, inp, inp.shape,
1282+
inp.stride(), constexpr_block_shape, out.shape,
1283+
out.stride(), SHARED_LAYOUT)
1284+
else:
1285+
assert TDM_TYPE == "HOST_TDM"
1286+
inp_desc = gluon.amd.gfx1250.TensorDescriptor.from_tensor(inp, list(BLOCK_SHAPE), layout=SHARED_LAYOUT)
1287+
out_desc = gluon.amd.gfx1250.TensorDescriptor.from_tensor(out, list(BLOCK_SHAPE), layout=SHARED_LAYOUT)
1288+
k = tensor_descriptor_load_store_nd_kernel_host_tdm[(1, )](out_desc, inp_desc)
12691289

12701290
amdgcn = k.asm["amdgcn"]
12711291
for pattern in ("tensor_load_to_lds", "tensor_store_from_lds", "s_wait_tensorcnt 0x0"):
@@ -1305,8 +1325,9 @@ def test_tensor_descriptor_load_store_invalid_blocksize():
13051325

13061326
# Expect compilation to fail due to block size exceeding maximum
13071327
try:
1308-
tensor_descriptor_load_store_nd_kernel[(1, )](out, inp, inp.shape, inp.stride(), constexpr_block_shape,
1309-
out.shape, out.stride(), SHARED_LAYOUT)
1328+
tensor_descriptor_load_store_nd_kernel_device_tdm[(1, )](out, inp, inp.shape,
1329+
inp.stride(), constexpr_block_shape, out.shape,
1330+
out.stride(), SHARED_LAYOUT)
13101331
pytest.fail(
13111332
f"Expected compilation to fail for block size {INNER_BLOCK} (2^17) > 65536 (2^16), but it succeeded")
13121333
except Exception as e:

0 commit comments

Comments (0)