Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c1323b7

Browse files
q10 authored and facebook-github-bot committed May 29, 2025
Limit the grid size for the TBE forward kernel
Summary: Limit the grid size for the TBE forward kernel, as we are observing thread counts over 2^32 for AMD runs. Reviewed By: yoyoyocmu, r-barnes. Differential Revision: D75543767
1 parent 3ba2c3a commit c1323b7

File tree

4 files changed

+51
-10
lines changed

4 files changed

+51
-10
lines changed
 

‎fbgemm_gpu/codegen/training/forward/embedding_forward_split_kernel_template.cu

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -645,11 +645,12 @@ batch_index_select_dim0_codegen_forward_kernel(
645645
{%- endif %}
646646

647647
// Determine the linearized warp ID, and exit early if needed
648+
{%- if is_index_select %}
648649
auto b_t = blockIdx.x * blockDim.y + threadIdx.y;
649-
{%- if not is_index_select %}
650-
if (b_t >= offsets.size(0) - 1) {
651-
return;
652-
}
650+
{%- else %}
651+
const auto total_B = offsets.size(0) - 1;
652+
// Since we place a limit on the grid size, we need to perform grid-striding
653+
for (auto b_t = blockIdx.x * blockDim.y + threadIdx.y; b_t < total_B; b_t += blockDim.y * gridDim.x) {
653654
{%- endif %}
654655

655656
// Determine the Table and Training Example IDs
@@ -832,6 +833,10 @@ batch_index_select_dim0_codegen_forward_kernel(
832833

833834
}
834835
{%- endif %}
836+
837+
{%- if not is_index_select %}
838+
} // for b_t
839+
{%- endif %}
835840
}
836841

837842

‎fbgemm_gpu/codegen/training/forward/embedding_forward_split_template.cu

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
////////////////////////////////////////////////////////////////////////////////
3838
#include "fbgemm_gpu/utils/ops_utils.h"
3939
{%- endif %}
40+
#include "fbgemm_gpu/utils/device_properties.cuh"
4041
#include "fbgemm_gpu/utils/kernel_launcher.cuh"
4142
#include "fbgemm_gpu/embedding_forward_template_helpers.cuh"
4243
#include "fbgemm_gpu/split_embeddings_cache_cuda.cuh"
@@ -708,6 +709,10 @@ batch_index_select_dim0_codegen_forward_cuda(
708709
constexpr auto kMaxVecsPerThread = kFixedMaxVecsPerThread;
709710
{%- endif %}
710711

712+
const auto grid = min(
713+
div_round_up(total_B, kForwardMaxThreads / kThreadGroupSize),
714+
utils::cuda::get_max_thread_blocks(at::cuda::getCurrentCUDAStream()));
715+
711716
FBGEMM_LAUNCH_KERNEL(
712717
({{ mdesc }}_embedding_codegen_forward_{{ desc_suffix }}_kernel
713718
<emb_t,
@@ -719,7 +724,7 @@ batch_index_select_dim0_codegen_forward_cuda(
719724
index_t,
720725
kMaxVecsPerThread,
721726
kThreadGroupSize>),
722-
div_round_up(total_B, kForwardMaxThreads / kThreadGroupSize),
727+
grid,
723728
dim3(kThreadGroupSize, kForwardMaxThreads / kThreadGroupSize),
724729
0,
725730
at::cuda::getCurrentCUDAStream(),

‎fbgemm_gpu/include/fbgemm_gpu/utils/device_properties.cuh

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,22 @@
88

99
#pragma once
1010

11+
#include <ATen/cuda/CUDAContext.h>
1112
#include <c10/cuda/CUDAException.h>
13+
#include <c10/cuda/CUDAStream.h>
1214
#include <cuda.h>
1315

14-
namespace fbgemm_gpu::utils {
16+
namespace fbgemm_gpu::utils::cuda {
17+
18+
// Based on the empirical study, max grid size that is 64x larger than the
19+
// number of SMs gives good performance across the board
20+
constexpr int32_t MAX_THREAD_BLOCKS_FACTOR = 64;
21+
22+
inline auto get_max_thread_blocks(const c10::cuda::CUDAStream& stream) {
23+
const auto device = stream.device_index();
24+
return MAX_THREAD_BLOCKS_FACTOR *
25+
at::cuda::getDeviceProperties(device)->multiProcessorCount;
26+
}
1527

1628
inline auto get_compute_versions() {
1729
static const auto versions = [] {
@@ -27,4 +39,4 @@ inline auto get_compute_versions() {
2739
return versions;
2840
}
2941

30-
} // namespace fbgemm_gpu::utils
42+
} // namespace fbgemm_gpu::utils::cuda

‎fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,13 @@ struct KernelLauncher {
180180
TORCH_CHECK(
181181
threads_per_block <= properties.maxThreadsPerBlock,
182182
context.description(),
183-
": Threads per block ",
183+
": [block dim ",
184+
block.x,
185+
" x ",
186+
block.y,
187+
" x ",
188+
block.z,
189+
"] Threads per block ",
184190
threads_per_block,
185191
" is greater than the limit of ",
186192
properties.maxThreadsPerBlock);
@@ -190,15 +196,28 @@ struct KernelLauncher {
190196
// automatically work around problem like CUDA does (V100 or newer
191197
// architectures), see:
192198
// https://github.com/ROCm/hip/issues/2253
199+
// https://rocm.docs.amd.com/projects/HIP/en/docs-develop/reference/hip_runtime_api/modules/occupancy.html
193200
const uint64_t total_threads = U64(grid.x) * U64(grid.y) * U64(grid.z) *
194201
U64(block.x) * U64(block.y) * U64(block.z);
195202

196203
TORCH_CHECK(
197204
total_threads < U64(std::numeric_limits<uint32_t>::max()),
198205
context.description(),
199-
": Total number of threads ",
206+
" [grid dim ",
207+
grid.x,
208+
" x ",
209+
grid.y,
210+
" x ",
211+
grid.z,
212+
"] [block dim ",
213+
block.x,
214+
" x ",
215+
block.y,
216+
" x ",
217+
block.z,
218+
"]: Total number of threads ",
200219
total_threads,
201-
" is greater than the limit of 2^32");
220+
" is greater than the HIP limit of 2^32");
202221
#endif
203222
}
204223

0 commit comments

Comments (0)
Please sign in to comment.