Commit e7d98ca

sryap authored and facebook-github-bot committed
Refactor bounds_check_indices (pytorch#4049)
Summary:
X-link: facebookresearch/FBGEMM#1140
Pull Request resolved: pytorch#4049

This diff refactors the `bounds_check_indices` operator:

- Moved the common host code into the common host function
- Moved the common device code into `embedding_bounds_check_common.cuh`
- Removed `const` from scalar args

Reviewed By: q10

Differential Revision: D73905710

fbshipit-source-id: 9c0d7c94b617b1a453904e0170cc3da5d9ab9bf0
1 parent 0c177e9 · commit e7d98ca

File tree: 4 files changed, +117 −150 lines

fbgemm_gpu/codegen/utils/embedding_bounds_check_host.cpp (+65 −16)

```diff
@@ -13,6 +13,7 @@
 #include <torch/library.h>
 
 #include "fbgemm_gpu/utils/ops_utils.h"
+#include "fbgemm_gpu/utils/tensor_utils.h"
 
 #include "fbgemm_gpu/config/feature_gates.h"
 #include "fbgemm_gpu/embedding_common.h"
@@ -25,29 +26,37 @@ void _bounds_check_indices_cuda_v1(
     Tensor& rows_per_table,
     Tensor& indices,
     Tensor& offsets,
-    int64_t bounds_check_mode,
+    fbgemm_gpu::BoundsCheckMode bounds_check_mode,
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B,
+    int64_t max_B,
     const std::optional<Tensor>& b_t_map,
-    const int32_t info_B_num_bits,
-    const uint32_t info_B_mask,
-    const bool prefetch_pipeline);
+    int32_t info_B_num_bits,
+    uint32_t info_B_mask,
+    int64_t T,
+    int64_t B,
+    int64_t total_B,
+    bool vbe,
+    bool prefetch_pipeline);
 
 void _bounds_check_indices_cuda_v2(
     Tensor& rows_per_table,
     Tensor& indices,
     Tensor& offsets,
-    int64_t bounds_check_mode,
+    fbgemm_gpu::BoundsCheckMode bounds_check_mode,
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B,
+    int64_t max_B,
     const std::optional<Tensor>& b_t_map,
-    const int32_t info_B_num_bits,
-    const uint32_t info_B_mask,
-    const bool prefetch_pipeline);
+    int32_t info_B_num_bits,
+    uint32_t info_B_mask,
+    int64_t T,
+    int64_t B,
+    int64_t total_B,
+    bool vbe,
+    bool prefetch_pipeline);
 
 ///@ingroup embedding-cuda
 void bounds_check_indices_cuda(
@@ -58,12 +67,12 @@ void bounds_check_indices_cuda(
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B,
+    int64_t max_B,
     const std::optional<Tensor>& b_t_map,
-    const int64_t info_B_num_bits,
-    const int64_t info_B_mask,
-    const int8_t bounds_check_version,
-    const bool prefetch_pipeline) {
+    int64_t info_B_num_bits,
+    int64_t info_B_mask,
+    int8_t bounds_check_version,
+    bool prefetch_pipeline) {
   TORCH_CHECK(bounds_check_version == 1 || bounds_check_version == 2);
   const static bool use_v2 =
       fbgemm_gpu::config::is_feature_enabled(
@@ -73,25 +82,65 @@ void bounds_check_indices_cuda(
       use_v2 ? _bounds_check_indices_cuda_v2 : _bounds_check_indices_cuda_v1;
   const auto bounds_check_mode_ =
       static_cast<fbgemm_gpu::BoundsCheckMode>(bounds_check_mode);
+
   TORCH_CHECK(
       bounds_check_mode_ == fbgemm_gpu::BoundsCheckMode::WARNING ||
           bounds_check_mode_ == fbgemm_gpu::BoundsCheckMode::FATAL ||
          bounds_check_mode_ == fbgemm_gpu::BoundsCheckMode::IGNORE,
       "bounds_check_indices: bounds_check_mode=",
       bounds_check_mode,
       " is not supported");
+
+  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
+      rows_per_table, indices, offsets, warning, weights, B_offsets, b_t_map);
+
+  TENSOR_NDIM_EQUALS(rows_per_table, 1);
+  TENSOR_NDIM_EQUALS(indices, 1);
+  TENSOR_NDIM_EQUALS(offsets, 1);
+  TENSOR_NDIM_EQUALS(warning, 1);
+
+  const auto T = rows_per_table.size(0);
+  const auto total_B = offsets.size(0) - 1;
+  const auto B = total_B / T;
+  if (total_B == 0 || T == 0) {
+    return;
+  }
+
+  const auto vbe = B_offsets.has_value();
+  if (vbe) {
+    TENSOR_NDIM_EQUALS(B_offsets.value(), 1);
+    TORCH_CHECK(max_B >= 0);
+  } else if (!vbe) {
+    TORCH_CHECK(
+        offsets.size(0) == B * T + 1,
+        "offsets size " + std::to_string(offsets.size(0)) +
+            " is not equal to B (" + std::to_string(B) + ") * T (" +
+            std::to_string(T) + ") + 1");
+  }
+  if (weights.has_value() && weights->numel() != 0) {
+    const auto num_indices = indices.size(0);
+    TORCH_CHECK(
+        weights->size(0) == num_indices,
+        "weights size " + std::to_string(weights->size(0)) +
+            " is not equal to indices size " + std::to_string(num_indices));
+  }
+
   bounds_check_indices_fn(
       rows_per_table,
       indices,
       offsets,
-      bounds_check_mode,
+      bounds_check_mode_,
       warning,
       weights,
       B_offsets,
       max_B,
       b_t_map,
       static_cast<int32_t>(info_B_num_bits),
       static_cast<uint32_t>(info_B_mask),
+      T,
+      B,
+      total_B,
+      vbe,
       prefetch_pipeline);
 }
 // Deprecated for fb namespace! Please use fbgemm namespace instead!
```
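The non-VBE check that now lives in this common host path enforces a CSR-style layout: with T tables and a per-table batch size B, `offsets` carries one entry per (table, sample) pair plus a terminal entry, so its length must be B * T + 1, and `offsets[b_t]` / `offsets[b_t + 1]` delimit the index range of bag `b_t`. A minimal standalone sketch of that invariant (illustrative values only, not FBGEMM code):

```cpp
// Standalone illustration of the offsets shape invariant checked above:
// T tables, per-table batch size B; offsets is CSR-style with a terminal
// entry, so offsets.size() must equal B * T + 1.
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const int64_t T = 2; // number of embedding tables
  const int64_t B = 3; // per-table batch size
  // One offset per (table, sample) pair plus a final terminal entry:
  // bag b_t covers indices[offsets[b_t] .. offsets[b_t + 1]).
  const std::vector<int64_t> offsets = {0, 2, 2, 5, 7, 7, 9};
  const auto total_B = static_cast<int64_t>(offsets.size()) - 1;
  assert(total_B == B * T);  // i.e., offsets.size() == B * T + 1
  assert(total_B / T == B);  // B recovered exactly as in the host code
  return 0;
}
```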

fbgemm_gpu/codegen/utils/embedding_bounds_check_v1.cu (+12 −73)

```diff
@@ -6,28 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
-#include "fbgemm_gpu/utils/tensor_accessor_builder.h"
-
-#include <c10/cuda/CUDADeviceAssertion.h>
-#include <c10/cuda/CUDAException.h>
-
-using Tensor = at::Tensor;
-using namespace fbgemm_gpu;
-
-template <typename index_t>
-__device__ void adjust_offset_kernel(
-    index_t& indices_start,
-    index_t& indices_end,
-    const index_t num_indices,
-    index_t* const offset_acc_start,
-    index_t* const offset_acc_end) {
-  indices_start =
-      std::max(static_cast<index_t>(0), std::min(indices_start, num_indices));
-  indices_end = std::max(indices_start, std::min(indices_end, num_indices));
-  *offset_acc_start = indices_start;
-  *offset_acc_end = indices_end;
-}
+#include "fbgemm_gpu/utils/embedding_bounds_check_common.cuh"
 
 template <typename index_t, bool vbe>
 __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v1(
@@ -37,7 +16,7 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v1(
     pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
     const int32_t* const B_offsets, // Use a raw pointer to avoid creating a
                                     // dummy PackedTensorAccessor
-    const int64_t bounds_check_mode_,
+    BoundsCheckMode bounds_check_mode,
     pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> warning,
     FixedDivisor fd,
     TORCH_DSA_KERNEL_ARGS) {
@@ -71,8 +50,6 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v1(
     B = total_B / T;
   }
 
-  const auto bounds_check_mode =
-      static_cast<BoundsCheckMode>(bounds_check_mode_);
   const auto num_rows = rows_per_table[t];
   auto indices_start = offsets[b_t];
   auto indices_end = offsets[b_t + 1];
@@ -179,70 +156,32 @@ __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v1(
   }
 }
 
-void check_weights_dim_matches_indices(
-    const std::optional<Tensor>& weights,
-    int64_t num_indices) {
-  if (weights.has_value() && weights->numel() != 0) {
-    TORCH_CHECK(
-        weights.value().size(0) == num_indices,
-        "weights size " + std::to_string(weights.value().size(0)) +
-            " is not equal to indices size " + std::to_string(num_indices));
-  }
-}
-
 void _bounds_check_indices_cuda_v1(
     Tensor& rows_per_table,
     Tensor& indices,
     Tensor& offsets,
-    int64_t bounds_check_mode_,
+    BoundsCheckMode bounds_check_mode,
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t max_B,
+    int64_t max_B,
     const std::optional<Tensor>& /*b_t_map*/,
-    const int32_t /*info_b_num_bits*/,
-    const uint32_t /*info_B_mask*/,
-    const bool prefetch_pipeline) {
-  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
-      rows_per_table, indices, offsets, warning, weights, B_offsets);
-  TENSOR_NDIM_EQUALS(rows_per_table, 1);
-  TENSOR_NDIM_EQUALS(indices, 1);
-  TENSOR_NDIM_EQUALS(offsets, 1);
-  TENSOR_NDIM_EQUALS(warning, 1);
+    int32_t /*info_b_num_bits*/,
+    uint32_t /*info_B_mask*/,
+    int64_t T,
+    int64_t B,
+    int64_t /*total_B*/,
+    bool vbe,
+    bool prefetch_pipeline) {
   TORCH_CHECK(
       !prefetch_pipeline,
       "bounds_check_indices_v1 does not support prefetch_pipeline=true")
 
-  const auto vbe = B_offsets.has_value();
-  if (vbe) {
-    TENSOR_NDIM_EQUALS(B_offsets.value(), 1);
-  }
-
   CUDA_DEVICE_GUARD(rows_per_table);
 
-  const int32_t T = rows_per_table.size(0);
-  const int32_t total_B = offsets.size(0) - 1;
-  const int32_t B = (total_B) / T;
-  if (total_B == 0 || T == 0) {
-    return;
-  }
-  const auto bounds_check_mode =
-      static_cast<BoundsCheckMode>(bounds_check_mode_);
   if (bounds_check_mode == BoundsCheckMode::WARNING) {
     warning.zero_();
   }
-  const int64_t num_indices = indices.size(0);
-
-  if (vbe) {
-    TORCH_CHECK(max_B >= 0);
-  } else {
-    TORCH_CHECK(
-        offsets.size(0) == B * T + 1,
-        "offsets size " + std::to_string(offsets.size(0)) +
-            " is not equal to B (" + std::to_string(B) + ") * T (" +
-            std::to_string(T) + ") + 1");
-  }
-  check_weights_dim_matches_indices(weights, num_indices);
 
   constexpr size_t kNumThreads = 256;
   const auto max_B_ = vbe ? max_B : B;
@@ -265,7 +204,7 @@ void _bounds_check_indices_cuda_v1(
         MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 32),
         MAKE_PTA_WITH_NAME(func_name, offsets, index_t, 1, 32),
         vbe ? B_offsets.value().data_ptr<int32_t>() : nullptr,
-        bounds_check_mode_,
+        bounds_check_mode,
        MAKE_PTA_WITH_NAME(func_name, warning, int64_t, 1, 32),
         FixedDivisor(max_B_));
   });
```
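The `adjust_offset_kernel` helper deleted here, and its near-twin deleted from the v2 file below, differ only in whether the write-back to the offsets array is guarded by `threadIdx.x == 0`. Both copies presumably moved into the new `embedding_bounds_check_common.cuh`, the fourth changed file (its roughly +30 lines are not shown in this view). A hedged reconstruction of what the shared helper plausibly looks like, inferred from the two removed copies rather than taken from the header itself:

```cuda
// Hedged sketch of the shared device helper that the new common header
// plausibly provides; reconstructed from the two removed copies, not
// copied from the (unshown) embedding_bounds_check_common.cuh.
template <typename index_t>
__device__ void adjust_offset_kernel(
    index_t& indices_start,
    index_t& indices_end,
    const index_t num_indices,
    index_t* const offset_acc_start,
    index_t* const offset_acc_end) {
  // Clamp the bag boundaries into [0, num_indices] while preserving
  // indices_start <= indices_end.
  indices_start =
      std::max(static_cast<index_t>(0), std::min(indices_start, num_indices));
  indices_end = std::max(indices_start, std::min(indices_end, num_indices));
  // The v1 copy wrote back unconditionally; the v2 copy guarded the write
  // with threadIdx.x == 0 because v2 runs multiple threads per bag. A
  // unified helper has to reconcile the two, e.g. by leaving the guard to
  // the caller (as assumed here) or by adding a bool template parameter.
  *offset_acc_start = indices_start;
  *offset_acc_end = indices_end;
}
```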

fbgemm_gpu/codegen/utils/embedding_bounds_check_v2.cu (+10 −61)

```diff
@@ -6,30 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
-#include "fbgemm_gpu/utils/tensor_accessor_builder.h"
-
-#include <c10/cuda/CUDADeviceAssertion.h>
-#include <c10/cuda/CUDAException.h>
-
-using Tensor = at::Tensor;
-using namespace fbgemm_gpu;
-
-template <typename index_t>
-__device__ void adjust_offset_kernel(
-    index_t& indices_start,
-    index_t& indices_end,
-    const index_t num_indices,
-    index_t* const offset_acc_start,
-    index_t* const offset_acc_end) {
-  indices_start =
-      std::max(static_cast<index_t>(0), std::min(indices_start, num_indices));
-  indices_end = std::max(indices_start, std::min(indices_end, num_indices));
-  if (threadIdx.x == 0) {
-    *offset_acc_start = indices_start;
-    *offset_acc_end = indices_end;
-  }
-}
+#include "fbgemm_gpu/utils/embedding_bounds_check_common.cuh"
 
 template <typename index_t, bool vbe, BoundsCheckMode bounds_check_mode>
 __global__ __launch_bounds__(kMaxThreads) void bounds_check_indices_kernel_v2(
@@ -195,57 +172,29 @@ void _bounds_check_indices_cuda_v2(
     Tensor& rows_per_table,
     Tensor& indices,
     Tensor& offsets,
-    int64_t bounds_check_mode_,
+    BoundsCheckMode bounds_check_mode,
     Tensor& warning,
     const std::optional<Tensor>& weights,
     const std::optional<Tensor>& B_offsets,
-    const int64_t /*max_B*/,
+    int64_t /*max_B*/,
     const std::optional<Tensor>& b_t_map,
-    const int32_t info_B_num_bits,
-    const uint32_t info_B_mask,
-    const bool prefetch_pipeline) {
-  TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
-      rows_per_table, indices, offsets, warning, weights, B_offsets, b_t_map);
-  TENSOR_NDIM_EQUALS(rows_per_table, 1);
-  TENSOR_NDIM_EQUALS(indices, 1);
-  TENSOR_NDIM_EQUALS(offsets, 1);
-  TENSOR_NDIM_EQUALS(warning, 1);
-
-  const auto vbe = B_offsets.has_value();
+    int32_t info_B_num_bits,
+    uint32_t info_B_mask,
+    int64_t /*T*/,
+    int64_t B,
+    int64_t total_B,
+    bool vbe,
+    bool prefetch_pipeline) {
   if (vbe) {
-    TENSOR_NDIM_EQUALS(B_offsets.value(), 1);
     TORCH_CHECK(b_t_map.has_value());
     TENSOR_NDIM_EQUALS(b_t_map.value(), 1);
   }
 
   CUDA_DEVICE_GUARD(rows_per_table);
 
-  const int32_t T = rows_per_table.size(0);
-  const int32_t total_B = offsets.size(0) - 1;
-  const int32_t B = (total_B) / T;
-  if (total_B == 0 || T == 0) {
-    return;
-  }
-  const auto bounds_check_mode =
-      static_cast<BoundsCheckMode>(bounds_check_mode_);
   if (bounds_check_mode == BoundsCheckMode::WARNING) {
     warning.zero_();
   }
-  const int64_t num_indices = indices.size(0);
-
-  if (!vbe) {
-    TORCH_CHECK(
-        offsets.size(0) == B * T + 1,
-        "offsets size " + std::to_string(offsets.size(0)) +
-            " is not equal to B (" + std::to_string(B) + ") * T (" +
-            std::to_string(T) + ") + 1");
-  }
-  if (weights.has_value()) {
-    TORCH_CHECK(
-        weights.value().size(0) == num_indices,
-        "weights size " + std::to_string(weights.value().size(0)) +
-            " is not equal to indices size " + std::to_string(num_indices));
-  }
 
   constexpr size_t kNumThreads = 1024;
   auto grid_dim =
```
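Note that the v2 kernel takes `BoundsCheckMode` as a compile-time template parameter, while v1 receives it at runtime, so the v2 host function must bridge a runtime enum value to a template argument at launch time. A minimal sketch of that dispatch pattern (illustrative only; the enum values, function names, and launch details here are assumptions, not the FBGEMM source):

```cpp
// Illustrative runtime-enum -> template-parameter dispatch: the pattern a
// host function needs in order to launch a kernel templated on
// BoundsCheckMode. Names and enum values are placeholders, not FBGEMM's.
#include <cstdio>

enum class BoundsCheckMode { FATAL, WARNING, IGNORE };

template <BoundsCheckMode mode>
void launch_bounds_check_v2() {
  // In the real code this would be something like:
  //   bounds_check_indices_kernel_v2<index_t, vbe, mode><<<grid, block>>>(...);
  std::printf("launched with mode %d\n", static_cast<int>(mode));
}

void dispatch(BoundsCheckMode mode) {
  // Each case instantiates a distinct kernel specialization.
  switch (mode) {
    case BoundsCheckMode::FATAL:
      launch_bounds_check_v2<BoundsCheckMode::FATAL>();
      break;
    case BoundsCheckMode::WARNING:
      launch_bounds_check_v2<BoundsCheckMode::WARNING>();
      break;
    case BoundsCheckMode::IGNORE:
      launch_bounds_check_v2<BoundsCheckMode::IGNORE>();
      break;
  }
}

int main() {
  dispatch(BoundsCheckMode::WARNING);
  return 0;
}
```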
