Skip to content

Commit d949707

Browse files
committed
TL/CUDA: Add int datatype support for NVLS
This PR adds support for the following additional integer data types in the NVLS (NVLink SHARP) collective operations allreduce and reduce_scatter:

- INT32 (s32): 32-bit signed integer
- INT64 (s64): 64-bit signed integer
- UINT32 (u32): 32-bit unsigned integer
- UINT64 (u64): 64-bit unsigned integer

Changes:
- Added PTX multimem.ld_reduce and multimem.st instructions for each type
- Created NvlsOps structs for type-specific operations
- Updated the allreduce and reduce_scatter kernels with new data type handling
- Modified the validation logic to accept the new data types

Signed-off-by: Juee14Desai <jueehimalbha@nvidia.com>
1 parent b7d4f76 commit d949707

File tree

6 files changed

+321
-23
lines changed

6 files changed

+321
-23
lines changed

src/components/tl/cuda/allreduce/allreduce.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,22 @@ ucc_status_t ucc_tl_cuda_allreduce_nvls_init(ucc_base_coll_args_t *coll_args,
3636
ucc_base_team_t *team,
3737
ucc_coll_task_t **task_h);
3838

39+
static inline int
40+
ucc_tl_cuda_allreduce_nvls_dt_supported(ucc_datatype_t dt)
41+
{
42+
switch (dt) {
43+
case UCC_DT_FLOAT32:
44+
case UCC_DT_BFLOAT16:
45+
case UCC_DT_INT32:
46+
case UCC_DT_UINT32:
47+
case UCC_DT_INT64:
48+
case UCC_DT_UINT64:
49+
return 1;
50+
default:
51+
return 0;
52+
}
53+
}
54+
3955
static inline int ucc_tl_cuda_allreduce_alg_from_str(const char *str)
4056
{
4157
int i;

src/components/tl/cuda/allreduce/allreduce_nvls.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,13 +186,13 @@ ucc_status_t ucc_tl_cuda_allreduce_nvls_init(
186186
ucc_status_t status;
187187

188188
if (buf_size < 1024 || coll_args->args.op != UCC_OP_SUM ||
189-
(coll_args->args.dst.info.datatype != UCC_DT_FLOAT32 &&
190-
coll_args->args.dst.info.datatype != UCC_DT_BFLOAT16)) {
189+
!ucc_tl_cuda_allreduce_nvls_dt_supported(
190+
coll_args->args.dst.info.datatype)) {
191191
tl_debug(
192192
UCC_TL_TEAM_LIB(team),
193193
"NVLS allreduce is supported only with SUM operation "
194-
"and float32 or bfloat16 datatype, with message size >= 1024 "
195-
"bytes");
194+
"and float32, bfloat16, int32, uint32, int64, or uint64 "
195+
"datatype, with message size >= 1024 bytes");
196196
return UCC_ERR_NOT_SUPPORTED;
197197
}
198198
if (ucc_unlikely(

src/components/tl/cuda/kernels/allreduce_kernel.cu

Lines changed: 91 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,72 @@ __global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
4848
nvls_bar(&(mc_bar->arrival_counter), &(uc_bar->arrival_counter), total_blocks * (launch_counter * 2 + 2));
4949
}
5050

51+
template <typename NvlsOps>
52+
__global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
53+
allreduce_kernel_scalar32(ucc_tl_cuda_nvls_control_t *mc_bar,
54+
ucc_tl_cuda_nvls_control_t *uc_bar,
55+
const uint32_t total_blocks,
56+
uint64_t launch_counter,
57+
uint32_t *base_u32, size_t count_u32, uint32_t rank,
58+
uint32_t tsize)
59+
{
60+
// pre barrier
61+
nvls_bar(&(mc_bar->arrival_counter), &(uc_bar->arrival_counter), total_blocks * (launch_counter * 2 + 1));
62+
63+
// Kernel execution
64+
size_t chunk_start = ((int64_t)count_u32 * (int64_t)rank) / (int64_t)tsize;
65+
size_t chunk_end = ((int64_t)count_u32 * (int64_t)(rank + 1)) / (int64_t)tsize;
66+
67+
size_t thread_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
68+
size_t stride = blockDim.x * gridDim.x * 4;
69+
70+
for (size_t idx = chunk_start + thread_offset; idx < chunk_end; idx += stride) {
71+
typename NvlsOps::value_type v0, v1, v2, v3;
72+
NvlsOps::ld(v0, base_u32 + idx + 0);
73+
NvlsOps::ld(v1, base_u32 + idx + 1);
74+
NvlsOps::ld(v2, base_u32 + idx + 2);
75+
NvlsOps::ld(v3, base_u32 + idx + 3);
76+
NvlsOps::st(v0, base_u32 + idx + 0);
77+
NvlsOps::st(v1, base_u32 + idx + 1);
78+
NvlsOps::st(v2, base_u32 + idx + 2);
79+
NvlsOps::st(v3, base_u32 + idx + 3);
80+
}
81+
82+
// post barrier
83+
nvls_bar(&(mc_bar->arrival_counter), &(uc_bar->arrival_counter), total_blocks * (launch_counter * 2 + 2));
84+
}
85+
86+
template <typename NvlsOps>
87+
__global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
88+
allreduce_kernel_scalar64(ucc_tl_cuda_nvls_control_t *mc_bar,
89+
ucc_tl_cuda_nvls_control_t *uc_bar,
90+
const uint32_t total_blocks,
91+
uint64_t launch_counter,
92+
uint64_t *base_u64, size_t count_u64, uint32_t rank,
93+
uint32_t tsize)
94+
{
95+
// pre barrier
96+
nvls_bar(&(mc_bar->arrival_counter), &(uc_bar->arrival_counter), total_blocks * (launch_counter * 2 + 1));
97+
98+
// Kernel execution
99+
size_t chunk_start = ((int64_t)count_u64 * (int64_t)rank) / (int64_t)tsize;
100+
size_t chunk_end = ((int64_t)count_u64 * (int64_t)(rank + 1)) / (int64_t)tsize;
101+
102+
size_t thread_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
103+
size_t stride = blockDim.x * gridDim.x * 2;
104+
105+
for (size_t idx = chunk_start + thread_offset; idx < chunk_end; idx += stride) {
106+
typename NvlsOps::value_type v0, v1;
107+
NvlsOps::ld(v0, base_u64 + idx + 0);
108+
NvlsOps::ld(v1, base_u64 + idx + 1);
109+
NvlsOps::st(v0, base_u64 + idx + 0);
110+
NvlsOps::st(v1, base_u64 + idx + 1);
111+
}
112+
113+
// post barrier
114+
nvls_bar(&(mc_bar->arrival_counter), &(uc_bar->arrival_counter), total_blocks * (launch_counter * 2 + 2));
115+
}
116+
51117
#ifdef __cplusplus
52118
extern "C" {
53119
#endif
@@ -69,17 +135,40 @@ ucc_status_t post_allreduce_kernel(cudaStream_t stream, uint32_t sm_count,
69135
ucc_tl_cuda_nvls_control_t *uc_bar = reinterpret_cast<ucc_tl_cuda_nvls_control_t *>(uc_control_addr);
70136
uint32_t expected_blocks = sm_count * tsize; // total num of blocks in the multicast group, num gpus * num blocks per gpu, used for barrier synchronization
71137

138+
assert(((uintptr_t)(mc_base_addr) % 8) == 0);
72139
switch (datatype) {
73140
case UCC_DT_FLOAT32:
74-
assert(((uintptr_t)(mc_base_addr) % 8) == 0);
75141
allreduce_kernel_vec32<NvlsFp32Ops><<<sm_count, threads, 0, stream>>>(
76142
mc_bar, uc_bar, expected_blocks, launch_counter, base_u32, count_u32, rank, tsize);
77143
break;
78144
case UCC_DT_BFLOAT16:
79-
assert(((uintptr_t)(mc_base_addr) % 8) == 0);
80145
allreduce_kernel_vec32<NvlsBf16Ops><<<sm_count, threads, 0, stream>>>(
81146
mc_bar, uc_bar, expected_blocks, launch_counter, base_u32, count_u32, rank, tsize);
82147
break;
148+
case UCC_DT_INT32:
149+
allreduce_kernel_scalar32<NvlsInt32Ops><<<sm_count, threads, 0, stream>>>(
150+
mc_bar, uc_bar, expected_blocks, launch_counter, base_u32, count_u32, rank, tsize);
151+
break;
152+
case UCC_DT_UINT32:
153+
allreduce_kernel_scalar32<NvlsUint32Ops><<<sm_count, threads, 0, stream>>>(
154+
mc_bar, uc_bar, expected_blocks, launch_counter, base_u32, count_u32, rank, tsize);
155+
break;
156+
case UCC_DT_INT64:
157+
{
158+
uint64_t *base_u64 = reinterpret_cast<uint64_t *>(mc_base_addr);
159+
size_t count_u64 = src_size_bytes / sizeof(uint64_t);
160+
allreduce_kernel_scalar64<NvlsInt64Ops><<<sm_count, threads, 0, stream>>>(
161+
mc_bar, uc_bar, expected_blocks, launch_counter, base_u64, count_u64, rank, tsize);
162+
}
163+
break;
164+
case UCC_DT_UINT64:
165+
{
166+
uint64_t *base_u64 = reinterpret_cast<uint64_t *>(mc_base_addr);
167+
size_t count_u64 = src_size_bytes / sizeof(uint64_t);
168+
allreduce_kernel_scalar64<NvlsUint64Ops><<<sm_count, threads, 0, stream>>>(
169+
mc_bar, uc_bar, expected_blocks, launch_counter, base_u64, count_u64, rank, tsize);
170+
}
171+
break;
83172
default:
84173
return UCC_ERR_NOT_SUPPORTED;
85174
}

src/components/tl/cuda/kernels/nvls.cuh

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,67 @@ struct NvlsBf16Ops {
8181
MULTIMEM_ST_BF16(v, ptr);
8282
}
8383
};
84+
85+
struct NvlsInt32Ops {
86+
using value_type = int;
87+
__device__ static inline void ld(int &v, const uint32_t *ptr) {
88+
asm("multimem.ld_reduce.global.add.s32 %0, [%1];"
89+
: "=r"(v)
90+
: "l"(ptr)
91+
: "memory");
92+
}
93+
__device__ static inline void st(const int &v, uint32_t *ptr) {
94+
asm volatile("multimem.st.global.s32 [%0], %1;" ::"l"(ptr),
95+
"r"(v)
96+
: "memory");
97+
}
98+
};
99+
100+
struct NvlsUint32Ops {
101+
using value_type = unsigned int;
102+
__device__ static inline void ld(unsigned int &v, const uint32_t *ptr) {
103+
asm("multimem.ld_reduce.global.add.u32 %0, [%1];"
104+
: "=r"(v)
105+
: "l"(ptr)
106+
: "memory");
107+
}
108+
__device__ static inline void st(const unsigned int &v, uint32_t *ptr) {
109+
asm volatile("multimem.st.global.u32 [%0], %1;" ::"l"(ptr),
110+
"r"(v)
111+
: "memory");
112+
}
113+
};
114+
115+
// PTX does not support s64 with add operation, so we use u64 instead
116+
struct NvlsInt64Ops {
117+
using value_type = unsigned long long;
118+
__device__ static inline void ld(unsigned long long &v, const uint64_t *ptr) {
119+
asm("multimem.ld_reduce.global.add.u64 %0, [%1];"
120+
: "=l"(v)
121+
: "l"(ptr)
122+
: "memory");
123+
}
124+
__device__ static inline void st(const unsigned long long &v, uint64_t *ptr) {
125+
asm volatile("multimem.st.global.u64 [%0], %1;" ::"l"(ptr),
126+
"l"(v)
127+
: "memory");
128+
}
129+
};
130+
131+
struct NvlsUint64Ops {
132+
using value_type = unsigned long long;
133+
__device__ static inline void ld(unsigned long long &v, const uint64_t *ptr) {
134+
asm("multimem.ld_reduce.global.add.u64 %0, [%1];"
135+
: "=l"(v)
136+
: "l"(ptr)
137+
: "memory");
138+
}
139+
__device__ static inline void st(const unsigned long long &v, uint64_t *ptr) {
140+
asm volatile("multimem.st.global.u64 [%0], %1;" ::"l"(ptr),
141+
"l"(v)
142+
: "memory");
143+
}
144+
};
84145
#endif // __cplusplus
85146

86147
#endif // UCC_TL_CUDA_NVLS_CUH_

src/components/tl/cuda/kernels/reduce_scatter_kernel.cu

Lines changed: 127 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ __global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
3232
size_t thread_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
3333
size_t stride = blockDim.x * gridDim.x * 4;
3434

35-
for (size_t idx = offset + thread_offset; idx < offset + count;
35+
for (size_t idx = offset + thread_offset; idx + 3 < offset + count;
3636
idx += stride) {
3737
uint4 val;
3838
NvlsOps::ld(val, base_u32 + idx);
@@ -47,6 +47,74 @@ __global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
4747
total_blocks * (launch_counter * 2 + 2));
4848
}
4949

50+
template <typename NvlsOps>
51+
__global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
52+
reduce_scatter_kernel_scalar32(
53+
ucc_tl_cuda_nvls_control_t *mc_bar, ucc_tl_cuda_nvls_control_t *uc_bar,
54+
const uint32_t total_blocks, uint64_t launch_counter,
55+
uint32_t *base_u32, size_t offset, size_t count, uint32_t *dst_u32)
56+
{
57+
// pre barrier
58+
nvls_bar(
59+
&(mc_bar->arrival_counter),
60+
&(uc_bar->arrival_counter),
61+
total_blocks * (launch_counter * 2 + 1));
62+
63+
size_t thread_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 4;
64+
size_t stride = blockDim.x * gridDim.x * 4;
65+
66+
for (size_t idx = offset + thread_offset; idx + 3 < offset + count;
67+
idx += stride) {
68+
typename NvlsOps::value_type v0, v1, v2, v3;
69+
NvlsOps::ld(v0, base_u32 + idx + 0);
70+
NvlsOps::ld(v1, base_u32 + idx + 1);
71+
NvlsOps::ld(v2, base_u32 + idx + 2);
72+
NvlsOps::ld(v3, base_u32 + idx + 3);
73+
dst_u32[idx - offset + 0] = v0;
74+
dst_u32[idx - offset + 1] = v1;
75+
dst_u32[idx - offset + 2] = v2;
76+
dst_u32[idx - offset + 3] = v3;
77+
}
78+
79+
// post barrier
80+
nvls_bar(
81+
&(mc_bar->arrival_counter),
82+
&(uc_bar->arrival_counter),
83+
total_blocks * (launch_counter * 2 + 2));
84+
}
85+
86+
template <typename NvlsOps>
87+
__global__ void __launch_bounds__(UCC_TL_CUDA_MAX_NVLS_THREADS)
88+
reduce_scatter_kernel_scalar64(
89+
ucc_tl_cuda_nvls_control_t *mc_bar, ucc_tl_cuda_nvls_control_t *uc_bar,
90+
const uint32_t total_blocks, uint64_t launch_counter,
91+
uint64_t *base_u64, size_t offset, size_t count, uint64_t *dst_u64)
92+
{
93+
// pre barrier
94+
nvls_bar(
95+
&(mc_bar->arrival_counter),
96+
&(uc_bar->arrival_counter),
97+
total_blocks * (launch_counter * 2 + 1));
98+
99+
size_t thread_offset = (threadIdx.x + blockIdx.x * blockDim.x) * 2;
100+
size_t stride = blockDim.x * gridDim.x * 2;
101+
102+
for (size_t idx = offset + thread_offset; idx + 1 < offset + count;
103+
idx += stride) {
104+
typename NvlsOps::value_type v0, v1;
105+
NvlsOps::ld(v0, base_u64 + idx + 0);
106+
NvlsOps::ld(v1, base_u64 + idx + 1);
107+
dst_u64[idx - offset + 0] = v0;
108+
dst_u64[idx - offset + 1] = v1;
109+
}
110+
111+
// post barrier
112+
nvls_bar(
113+
&(mc_bar->arrival_counter),
114+
&(uc_bar->arrival_counter),
115+
total_blocks * (launch_counter * 2 + 2));
116+
}
117+
50118
#ifdef __cplusplus
51119
extern "C" {
52120
#endif
@@ -101,6 +169,64 @@ ucc_status_t post_reduce_scatter_kernel(
101169
count,
102170
reinterpret_cast<uint32_t *>(dst_ptr));
103171
break;
172+
case UCC_DT_INT32:
173+
reduce_scatter_kernel_scalar32<NvlsInt32Ops>
174+
<<<sm_count, threads, 0, stream>>>(
175+
mc_bar,
176+
uc_bar,
177+
expected_blocks,
178+
launch_counter,
179+
base_u32,
180+
offset,
181+
count,
182+
reinterpret_cast<uint32_t *>(dst_ptr));
183+
break;
184+
case UCC_DT_UINT32:
185+
reduce_scatter_kernel_scalar32<NvlsUint32Ops>
186+
<<<sm_count, threads, 0, stream>>>(
187+
mc_bar,
188+
uc_bar,
189+
expected_blocks,
190+
launch_counter,
191+
base_u32,
192+
offset,
193+
count,
194+
reinterpret_cast<uint32_t *>(dst_ptr));
195+
break;
196+
case UCC_DT_INT64:
197+
{
198+
uint64_t *base_u64 = reinterpret_cast<uint64_t *>(mc_base_addr);
199+
size_t offset_u64 = offset / 2;
200+
size_t count_u64 = count / 2;
201+
reduce_scatter_kernel_scalar64<NvlsInt64Ops>
202+
<<<sm_count, threads, 0, stream>>>(
203+
mc_bar,
204+
uc_bar,
205+
expected_blocks,
206+
launch_counter,
207+
base_u64,
208+
offset_u64,
209+
count_u64,
210+
reinterpret_cast<uint64_t *>(dst_ptr));
211+
}
212+
break;
213+
case UCC_DT_UINT64:
214+
{
215+
uint64_t *base_u64 = reinterpret_cast<uint64_t *>(mc_base_addr);
216+
size_t offset_u64 = offset / 2;
217+
size_t count_u64 = count / 2;
218+
reduce_scatter_kernel_scalar64<NvlsUint64Ops>
219+
<<<sm_count, threads, 0, stream>>>(
220+
mc_bar,
221+
uc_bar,
222+
expected_blocks,
223+
launch_counter,
224+
base_u64,
225+
offset_u64,
226+
count_u64,
227+
reinterpret_cast<uint64_t *>(dst_ptr));
228+
}
229+
break;
104230
default:
105231
return UCC_ERR_NOT_SUPPORTED;
106232
}

0 commit comments

Comments
 (0)