
Commit 57c932d

q10 authored and facebook-github-bot committed
Update the rowwise adagrad optimizer to leverage optimizer state offloading, v4, backend (pytorch#4195)
Summary:
X-link: facebookresearch/FBGEMM#1271

Update the rowwise adagrad optimizer to leverage optimizer state offloading, v4. This is a revision of D74827718 that makes the flag an SSD-specific flag rather than an optimizer-specific one; scoping it to SSD expresses clear intent about the flag's use.

This diff adds support for leveraging optimizer state offloading when making optimizer state updates, starting with the rowwise adagrad optimizer:

- Add the SSD-specific flag `enable_optimizer_offloading` to the table update kernel to enable handling optimizer offloading, starting with the rowwise adagrad case
- Propagate the flag upwards to `torch.ops.fbgemm.{{ mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function_pt2`

Differential Revision: D75329024
1 parent: 0064436 · commit: 57c932d

9 files changed (+131, −6 lines)
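The substance of the change is in the generated rowwise adagrad update: when `enable_optimizer_offloading` is set, the running sum of squared gradients is read from and written back to an `OptimizerState` view over the cache row itself rather than the separate `momentum1` tensor. The standalone C++ sketch below mirrors that branch; the types here (a plain `OptimizerState` struct, `momentum1` as a `std::vector`) are simplified stand-ins for the FBGEMM device types, not the actual implementation.

#include <cmath>
#include <cstdio>
#include <vector>

// Stand-in for the generated OptimizerState view (one momentum per row)
struct OptimizerState {
  float momentum;  // running sum of squared gradients
};

float rowwise_adagrad_multiplier(
    float g_avg_square,               // mean squared gradient for this row
    float learning_rate,
    float eps,
    bool enable_optimizer_offloading,
    OptimizerState* offloaded_state,  // state embedded in the cache row (SSD path)
    std::vector<float>& momentum1,    // separate optimizer state tensor (default path)
    int idx) {
  float new_sum_square_grads = g_avg_square;
  if (enable_optimizer_offloading) {
    // SSD path: read and update the state stored alongside the weights
    new_sum_square_grads += offloaded_state->momentum;
    offloaded_state->momentum = new_sum_square_grads;
  } else {
    // Default path: read and update the dense momentum1 tensor
    new_sum_square_grads += momentum1[idx];
    momentum1[idx] = new_sum_square_grads;
  }
  return learning_rate / (std::sqrt(new_sum_square_grads) + eps);
}

int main() {
  std::vector<float> momentum1(4, 0.0f);
  OptimizerState row_state{0.0f};
  // Same gradient statistics, two different storage locations for the state
  const float m_tensor = rowwise_adagrad_multiplier(
      0.25f, 0.1f, 1e-8f, /*enable_optimizer_offloading=*/false, &row_state, momentum1, 0);
  const float m_offload = rowwise_adagrad_multiplier(
      0.25f, 0.1f, 1e-8f, /*enable_optimizer_offloading=*/true, &row_state, momentum1, 0);
  std::printf("multiplier: tensor path = %f, offloaded path = %f\n", m_tensor, m_offload);
  return 0;
}

Both paths produce the same multiplier; only where the state lives differs, which is the point of the flag.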

fbgemm_gpu/codegen/genscript/optimizers.py

Lines changed: 22 additions & 4 deletions
@@ -186,15 +186,33 @@ def rowwise_adagrad() -> Dict[str, Any]:
             g_local_sum_square += gx * gx + gy * gy + gz * gz + gw * gw;
         """
     )
-    split_precomputation += """
+    split_precomputation += """
+    // Define the rowwise adagrad optimizer state struct view
+    struct [[maybe_unused]] OptimizerState {
+        at::acc_type<cache_t, true> momentum;
+    };
+
     const at::acc_type<cache_t, true> g_avg_square =
         GROUP_REDUCE_ALL_SUM(g_local_sum_square, at::acc_type<cache_t, true>) / D;

     at::acc_type<cache_t, true> multiplier = 0.0;
     at::acc_type<cache_t, true> correction = 0.0;
-    if (threadIdx.x == 0) {
-        at::acc_type<cache_t, true> new_sum_square_grads = momentum1[idx] + g_avg_square;
-        momentum1[idx] = new_sum_square_grads;
+    if (threadIdx.x == 0) {
+        auto new_sum_square_grads = g_avg_square;
+
+        // Update the optimizer state. Use optimizer state offloading only if
+        // SSD and if enabled by the user
+        if (enable_optimizer_offloading) {
+            // Fetch the pointer to the optimizer state along the cache row
+            auto* optimizer = weight_row_template.template optimizer_state_ptr<OptimizerState>();
+            new_sum_square_grads += optimizer->momentum;
+            optimizer->momentum = new_sum_square_grads;
+
+        } else {
+            new_sum_square_grads += momentum1[idx];
+            momentum1[idx] = new_sum_square_grads;
+        }
+
         multiplier = learning_rate / (sqrtf(new_sum_square_grads) + eps);
         if (weight_decay_mode == 1) {
             // L2 regularization
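The generated kernel reaches the offloaded state through `weight_row_template.template optimizer_state_ptr<OptimizerState>()`. Below is a rough sketch of that struct-view idea, under the assumption that the optimizer state occupies the bytes immediately after the embedding weights in the cache row; `CacheRowView` and the exact layout are illustrative assumptions, not the FBGEMM definitions.

#include <cstdio>
#include <vector>

struct OptimizerState {
  float momentum;
};

// Assumed row layout: [dim weight elements][optimizer state]
struct CacheRowView {
  float* data;
  int dim;  // number of weight elements in the row

  OptimizerState* optimizer_state_ptr() {
    // Reinterpret the tail of the row as a typed optimizer-state struct
    return reinterpret_cast<OptimizerState*>(data + dim);
  }
};

int main() {
  const int dim = 8;
  std::vector<float> row(dim + sizeof(OptimizerState) / sizeof(float), 0.0f);
  CacheRowView view{row.data(), dim};
  view.optimizer_state_ptr()->momentum = 3.5f;
  std::printf("momentum stored in row tail: %f\n", view.optimizer_state_ptr()->momentum);
  return 0;
}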

fbgemm_gpu/codegen/training/backward/embedding_backward_split_host_template.cpp

Lines changed: 28 additions & 0 deletions
@@ -187,6 +187,9 @@ enum SSDTensor {
     use_uniq_cache_locations_bwd,
     use_homogeneous_placements,
     {%- endif %}
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     prev_iter_dev,

@@ -350,6 +353,9 @@ enum SSDTensor {
     is_experimental,
     use_uniq_cache_locations_bwd,
     use_homogeneous_placements,
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     prev_iter_dev,

@@ -520,6 +526,9 @@ Tensor
     {%- if not dense %}
     const bool use_uniq_cache_locations,
     const bool use_homogeneous_placements,
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}

@@ -609,6 +618,9 @@ class {{ autograd_func }} :
     const bool is_experimental,
     const bool use_uniq_cache_locations_bwd,
     const bool use_homogeneous_placements,
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     const std::optional<Tensor>& prev_iter_dev,

@@ -783,6 +795,11 @@ class {{ autograd_func }} :
     ctx->saved_data["use_uniq_cache_locations_bwd"] = use_uniq_cache_locations_bwd;
     ctx->saved_data["use_homogeneous_placements"] = use_homogeneous_placements;
     {%- endif %}
+
+    {%- if ssd %}
+    ctx->saved_data["enable_optimizer_offloading"] = enable_optimizer_offloading;
+    {%- endif %}
+
     {%- if is_gwd %}
     {%- if "iter" not in args.split_function_arg_names %}
     ctx->saved_data["iter"] = iter;

@@ -900,6 +917,11 @@ class {{ autograd_func }} :
     const auto use_homogeneous_placements =
         ctx->saved_data["use_homogeneous_placements"].toBool();
     {%- endif %}
+
+    {%- if ssd %}
+    const auto enable_optimizer_offloading =
+        ctx->saved_data["enable_optimizer_offloading"].toBool();
+    {%- endif %}

     {%- if is_gwd %}
     {%- if "iter" not in args.split_function_arg_names %}

@@ -1065,6 +1087,9 @@ Tensor {{ bwd_mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function(
     const bool is_experimental_tbe = false, // formerly named is_experimental
     const bool use_uniq_cache_locations_bwd = false,
     const bool use_homogeneous_placements = false,
+    {%- if ssd %}
+    const bool enable_optimizer_offloading = false,
+    {%- endif %}
     const std::optional<Tensor>& uvm_cache_stats = std::nullopt,
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     const std::optional<Tensor>& prev_iter_dev = std::nullopt,

@@ -1185,6 +1210,9 @@ TORCH_LIBRARY_FRAGMENT({{ lib_name }}, m) {
     " bool is_experimental=False, "
     " bool use_uniq_cache_locations_bwd=False, "
     " bool use_homogeneous_placements=False, "
+    {%- if ssd %}
+    " bool enable_optimizer_offloading=False, "
+    {%- endif %}
     " Tensor? uvm_cache_stats=None, "
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     " Tensor? prev_iter_dev=None, "

fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_cta_template.cu

Lines changed: 9 additions & 0 deletions
@@ -155,6 +155,9 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

@@ -386,6 +389,9 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
     {%- endif %}
     shfl_sync_mask,
     max_vecs,
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {{ args.split_kernel_arg_names | join(", ") }}
 );
 {%- else %}

@@ -523,6 +529,9 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

fbgemm_gpu/codegen/training/backward/embedding_backward_split_kernel_warp_template.cu

Lines changed: 9 additions & 0 deletions
@@ -133,6 +133,9 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

@@ -296,6 +299,9 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     {%- endif %}
     shfl_sync_mask,
     max_vecs,
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {{ args.split_kernel_arg_names | join(", ") }}
 );
 {%- else %}

@@ -426,6 +432,9 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

fbgemm_gpu/codegen/training/backward/embedding_backward_split_meta_template.cpp

Lines changed: 3 additions & 0 deletions
@@ -110,6 +110,9 @@ Tensor {{ mdesc }}_embedding{{ ndesc }}_backward_codegen_{{ optimizer }}_{{ desc
     const bool use_uniq_cache_locations,
     const bool use_homogeneous_placements,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const Tensor& grad_offsets,
     const Tensor& total_L_offsets,

fbgemm_gpu/codegen/training/backward/embedding_backward_split_template.cu

Lines changed: 18 additions & 0 deletions
@@ -130,6 +130,9 @@ batch_index_select_dim0_codegen_backward_kernel_cta_per_row(
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

@@ -213,6 +216,9 @@ batch_index_select_dim0_codegen_backward_kernel_warp_per_row(
     {%- endif %}
     const float gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> grad_offsets,
     const bool permute_output_dim_0_1

@@ -572,6 +578,9 @@ Tensor {{ embedding_cuda_op }}(
     const bool use_uniq_cache_locations,
     const bool use_homogeneous_placements,
     {%- endif %}
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     const Tensor& grad_offsets,
     const Tensor& total_L_offsets,

@@ -1132,6 +1141,9 @@ Tensor {{ embedding_cuda_op }}(
     {%- endif %}
     gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     grad_offsets.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
     permute_output_dim_0_1

@@ -1288,6 +1300,9 @@ Tensor {{ embedding_cuda_op }}(
     {%- endif %}
     gwd_lower_bound,
     {%- endif %}
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {%- if is_index_select %}
     grad_offsets.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
     permute_output_dim_0_1

@@ -1380,6 +1395,9 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " bool use_uniq_cache_locations, "
     " bool use_homogeneous_placements, "
     {%- endif %}
+    {%- if ssd %}
+    " bool enable_optimizer_offloading, "
+    {%- endif %}
     {%- if is_gwd_kernel %}
     {%- if "prev_iter_dev" not in args.split_function_arg_names %}
     " Tensor prev_iter_dev, "

fbgemm_gpu/codegen/training/optimizer/embedding_optimizer_split_device_kernel_template.cuh

Lines changed: 7 additions & 0 deletions
@@ -55,6 +55,9 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
     {%- endif %}
     const uint32_t shfl_sync_mask,
     const int32_t max_vecs_per_thread,
+    {%- if ssd %}
+    const bool enable_optimizer_offloading,
+    {%- endif %}
     {{ args.split_ref_kernel_args | replace_pta_namespace() | join(",\n ") }}
 ) {
     constexpr auto kIsInt8 = std::is_same_v<emb_t, uint8_t>;

@@ -113,6 +116,10 @@ DEVICE_INLINE void {{ mdesc }}_{{ optimizer }}_table_update_kernel(
         }
     }

+    {%- if not ssd %}
+    constexpr auto enable_optimizer_offloading = false;
+    {%- endif %}
+
     {{ split_precomputation }}

     {# /* Note: technically, global weight decay (gwd) compensation should be done before
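The second hunk above is what keeps the generated `{{ split_precomputation }}` body uniform across builds: non-SSD instantiations define `enable_optimizer_offloading` as a `constexpr` false local instead of taking it as a kernel argument, so the shared body always compiles and the offloading branch becomes dead code the compiler can eliminate. A small host-side C++ sketch of that pattern, with illustrative placeholder update bodies:

#include <cstdio>

// Stand-in for the shared, generated update body: it always references the
// flag, regardless of which variant supplies it
float shared_update_body(float g_avg_square, bool enable_optimizer_offloading) {
  if (enable_optimizer_offloading) {
    return g_avg_square + 1.0f;  // placeholder for the offloaded-state path
  }
  return g_avg_square;  // placeholder for the momentum1 path
}

// SSD variant: the flag is a real runtime parameter
float ssd_update(float g, bool enable_optimizer_offloading) {
  return shared_update_body(g, enable_optimizer_offloading);
}

// Non-SSD variant: the flag is pinned to false at compile time, mirroring the
// `{%- if not ssd %}` branch above
float dense_update(float g) {
  constexpr auto enable_optimizer_offloading = false;
  return shared_update_body(g, enable_optimizer_offloading);
}

int main() {
  std::printf("%f %f\n", ssd_update(1.0f, true), dense_update(1.0f));
  return 0;
}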

fbgemm_gpu/codegen/training/pt2/embedding_split_host_pt2_autograd_template.cpp

Lines changed: 23 additions & 2 deletions
@@ -251,6 +251,9 @@ enum SSDTensor {
     {%- endif %}
     const bool /*use_uniq_cache_locations_bwd*/,
     const bool /*use_homogeneous_placements*/,
+    {%- if ssd %}
+    const bool /*enable_optimizer_offloading*/,
+    {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args_pt2.split_function_arg_names %}
     const Tensor& /*prev_iter_dev*/,

@@ -319,6 +322,9 @@ enum SSDTensor {
     {%- if not dense %}
     use_uniq_cache_locations_bwd,
     use_homogeneous_placements,
+    {%- if ssd %}
+    enable_optimizer_offloading,
+    {%- endif %}
     {%- endif %}
     {%- if is_gwd %}
     {%- if "prev_iter_dev" not in args_pt2.split_function_arg_names %}

@@ -399,6 +405,7 @@ enum SSDTensor {
     {%- for tensor in ssd_tensors %}
     ret.push_back(Variable()); // {{ tensor }}
     {%- endfor %}
+    ret.push_back(Variable()); // enable_optimizer_offloading
     {%- endif %}
     {{ args_pt2.unified_pt2.split_variables | join("\n") }}
     return ret;

@@ -468,6 +475,7 @@ enum SSDTensor {
     aux_bool,
     {%- if ssd %}
     ssd_tensors.value(),
+    enable_optimizer_offloading,
     {%- endif %}
     {{ args_pt2.unified_pt2.split_function_arg_names | join(", ") }}
     {%- endif %}

@@ -628,6 +636,7 @@ class {{ autograd_func }} :
     {%- endif %}
     {%- if ssd %}
     const at::TensorList& ssd_tensors,
+    const bool enable_optimizer_offloading,
     {%- endif %}
     {{ args_pt2.unified_pt2.split_function_args | join(", ") }}) {

@@ -817,6 +826,11 @@ class {{ autograd_func }} :
     ctx->saved_data["use_uniq_cache_locations_bwd"] = static_cast<bool>(aux_bool[IDX_USE_UNIQ_CACHE_LOCATIONS_BWD]);
     ctx->saved_data["use_homogeneous_placements"] = static_cast<bool>(aux_bool[IDX_USE_HOMOGENEOUS_PLACEMENTS]);
     {%- endif %}
+
+    {%- if ssd %}
+    ctx->saved_data["enable_optimizer_offloading"] = enable_optimizer_offloading;
+    {%- endif %}
+
     const auto iter = aux_int[IDX_ITER];
     ctx->saved_data["iter"] = iter;
     {%- if is_gwd %}

@@ -950,6 +964,11 @@ static torch::autograd::variable_list backward(
     const auto use_uniq_cache_locations_bwd = ctx->saved_data["use_uniq_cache_locations_bwd"].toBool();
     const auto use_homogeneous_placements = ctx->saved_data["use_homogeneous_placements"].toBool();
     {%- endif %}
+
+    {%- if ssd %}
+    const auto enable_optimizer_offloading = ctx->saved_data["enable_optimizer_offloading"].toBool();
+    {%- endif %}
+
     {%- if is_gwd or "iter" in args_pt2.unified_pt2.split_unpacked_arg_names %}
     const auto iter = ctx->saved_data["iter"].toInt();
     {%- endif %}

@@ -1148,7 +1167,8 @@ Tensor {{ bwd_mdesc }}_embedding_codegen_lookup_{{ optimizer }}_function_pt2(
     const c10::SymInt max_B_feature_rank = -1,
     {%- if ssd %}
     const c10::SymInt vbe_output_size = -1,
-    const std::optional<at::TensorList>& ssd_tensors = std::nullopt
+    const std::optional<at::TensorList>& ssd_tensors = std::nullopt,
+    bool enable_optimizer_offloading = false
     {%- else %}
     const c10::SymInt vbe_output_size = -1
     {%- endif %}

@@ -1242,7 +1262,8 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
     " SymInt max_B_feature_rank=-1, "
     {%- if ssd %}
     " SymInt vbe_output_size=-1, "
-    " Tensor[]? ssd_tensors=None"
+    " Tensor[]? ssd_tensors=None, "
+    " bool enable_optimizer_offloading=False "
     {%- else %}
     " SymInt vbe_output_size=-1 "
     {%- endif %}