
Commit 9af600e

sryap authored and facebook-github-bot committed

Fix weighted TBE inference NaN (un-init) row_weights (pytorch#4006)

Summary:
Pull Request resolved: pytorch#4006
X-link: facebookresearch/FBGEMM#1093

This diff fixes a problem introduced by D70855331: the `per_sample_weights` in the TBE inference kernels were not properly initialized and became NaNs, causing the embedding lookup output to contain NaNs.

Reviewed By: jwfromm

Differential Revision: D73387999

fbshipit-source-id: 005e41f829cd6e255d3b78a0bb06dfd491ae0ee4

1 parent: 0040646
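To illustrate the failure mode described above, here is a minimal CUDA sketch (illustrative names only, not the FBGEMM kernel): if lanes that handle invalid (pruned) rows skip the shared-memory store, the staged per-sample weight slot keeps whatever value was already in shared memory, possibly NaN, and that NaN poisons the pooled sum. The fix is for the designated lane to always store, writing 0.0 for invalid rows.

// Minimal sketch of the bug and the fix; all names are illustrative.
__global__ void pool_weighted_bag(
    const float* indice_weights, // per-sample weights, one per index
    const int* indices,          // -1 marks a pruned row
    float* out,
    int L) {
  __shared__ float staged_weights[64]; // assumes L <= 64 in this sketch
  const int l = threadIdx.x;
  if (l < L) {
    const bool valid = indices[l] >= 0;
    // Buggy pattern: `if (valid) staged_weights[l] = indice_weights[l];`
    // leaves slots for pruned rows uninitialized (possibly NaN).
    // Fixed pattern: always store, zero-filling invalid rows.
    staged_weights[l] = valid ? indice_weights[l] : 0.0f;
  }
  __syncthreads();
  if (l == 0) {
    float acc = 0.0f;
    for (int i = 0; i < L; ++i) {
      acc += staged_weights[i]; // pruned rows contribute exactly 0
    }
    *out = acc;
  }
}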

3 files changed: +100 −13

fbgemm_gpu/codegen/inference/embedding_forward_quantized_split_nbit_kernel_template.cu

+6 −12

@@ -212,15 +212,12 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
           buffers[warp_idx][i][input_row_idx][row_load_idx] = data;
         }
         {% if weighted %}
-        {%- if is_rocm %}
-        if (valid && row_load_idx == 0) {
+        if (row_load_idx == 0) {
           // Use only one thread to load the index weight to prevent a race
           // condition when writing to the shared memory
-          buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] = indice_weights[indices_starts[i] + L_start + input_row_idx];
+          buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] =
+              valid ? indice_weights[indices_starts[i] + L_start + input_row_idx] : 0.0;
         }
-        {%- else %}
-        buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] = valid ? indice_weights[indices_starts[i] + L_start + input_row_idx] : 0.0;
-        {%- endif %}
         {% endif %}
       }
     }

@@ -255,15 +252,12 @@ __global__ void {{ emb_weight_type.enum_name }}_split_embedding{{ "_nobag" if no
           cp_async_zfill_cg<sizeof(uint4)>(&buffers[warp_idx][i][input_row_idx][row_load_idx], &row[row_load_idx], valid);
         }
         {% if weighted %}
-        {%- if is_rocm %}
-        if (valid && row_load_idx == 0) {
+        if (row_load_idx == 0) {
           // Use only one thread to load the index weight to prevent a race
           // condition when writing to the shared memory
-          buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] = indice_weights[indices_starts[i] + L_start + input_row_idx];
+          buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] =
+              valid ? indice_weights[indices_starts[i] + L_start + input_row_idx] : 0.0;
         }
-        {%- else %}
-        buffers_indice_weights[warp_idx][i][input_row_idx][packed_bag_load_idx] = valid ? indice_weights[indices_starts[i] + L_start + input_row_idx] : 0.0;
-        {%- endif %}
         {% endif %}
       }
       {%- if is_rocm %}
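Note on the kernel change: previously the ROCm branch stored the index weight only when `valid` was true, leaving the shared-memory slot uninitialized for pruned (invalid) rows, while the non-ROCm branch let every `row_load_idx` lane perform the store. The unified code keeps the single-lane store to avoid the write race mentioned in the comment, but always writes, zero-filling invalid rows; this mirrors the zero-fill behavior of the `cp_async_zfill_cg` row load in the second hunk.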

fbgemm_gpu/test/tbe/inference/failures_dict_fast.json

+5 −1

@@ -12,6 +12,7 @@
     "fbgemm::HFP8QuantizedToFloat": {},
     "fbgemm::asynchronous_complete_cumsum": {},
     "fbgemm::bounds_check_indices": {},
+    "fbgemm::check_feature_gate_key": {},
     "fbgemm::dense_embedding_codegen_lookup_function": {
       "BackwardDenseTest.test_autograd_registration__test_backward_dense": {
         "comment": "",

@@ -30,12 +31,14 @@
       }
     },
     "fbgemm::emb_inplace_update": {},
+    "fbgemm::get_infos_metadata": {},
     "fbgemm::get_unique_indices": {
       "LXUCacheTest.test_faketensor__test_unique_lxu_cache_lookup": {
         "comment": "",
         "status": "xfail"
       }
     },
+    "fbgemm::initialize_nan_shared_mem": {},
     "fbgemm::int_nbit_split_embedding_codegen_lookup_function": {
       "NBitForwardTest.test_faketensor__test_nbit_forward_fused_pooled_emb_quant": {
         "comment": "",

@@ -306,6 +309,7 @@
     "fbgemm::split_embedding_codegen_lookup_rowwise_adagrad_function_cpu": {},
     "fbgemm::split_embedding_codegen_lookup_rowwise_weighted_adagrad_function": {},
     "fbgemm::split_embedding_codegen_lookup_sgd_function": {},
-    "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": {}
+    "fbgemm::split_embedding_codegen_lookup_sgd_function_cpu": {},
+    "fbgemm::split_embedding_codegen_lookup_sgd_function_pt2": {}
   }
 }

fbgemm_gpu/test/tbe/inference/nbit_forward_test.py

+89 −0

@@ -115,6 +115,9 @@
             "Operator outputs int4 tensors which do not support opcheck tests"
         ),
     ],
+    "test_faketensor__test_nbit_forward_fused_pooled_emb_quant_nan_weighted": [
+        unittest.skip("Operator not implemented for fake tensors"),
+    ],
 }


@@ -354,6 +357,92 @@ def test_nbit_forward_fused_pooled_emb_quant_against_ref(
             **kwargs,
         )

+    @unittest.skipIf(*gpu_unavailable)
+    def test_nbit_forward_fused_pooled_emb_quant_nan_weighted(self) -> None:
+        # Hash size
+        E = 10
+        # Embedding dimension
+        D = 160
+        # Pooling factor
+        L = 64
+
+        # Use TBE training op as a reference
+        op_ref = SplitTableBatchedEmbeddingBagsCodegen(
+            [
+                (E, D, EmbeddingLocation.DEVICE, ComputeDevice.CUDA),
+            ],
+            weights_precision=SparseType.FP32,
+            output_dtype=SparseType.FP32,
+            device=torch.cuda.current_device(),
+        )
+
+        # Instantiate TBE inference
+        op = IntNBitTableBatchedEmbeddingBagsCodegen(
+            embedding_specs=[
+                (
+                    "",
+                    E,
+                    D,
+                    SparseType.INT4,
+                    EmbeddingLocation.DEVICE,
+                ),
+            ],
+            output_dtype=SparseType.FP16,
+        )
+
+        # Initialize weights_ref with 1.0
+        weights_ref = op_ref.split_embedding_weights()
+        weights_ref[0].fill_(1.0)
+
+        # Copy weights_ref to weights
+        op.initialize_weights()
+        weights = op.split_embedding_weights()
+        quant_weights, quant_scale_shift = quantize_embs(
+            weights_ref[0], SparseType.INT4
+        )
+        weights[0][0].copy_(quant_weights)
+        weights[0][1].copy_(quant_scale_shift)
+
+        # Generate inputs
+        indices = torch.as_tensor(
+            [0] * L, device=torch.cuda.current_device(), dtype=torch.int
+        )
+        offsets = torch.as_tensor(
+            [0, L], device=torch.cuda.current_device(), dtype=torch.int
+        )
+        per_sample_weights = torch.arange(
+            L, device=torch.cuda.current_device(), dtype=torch.float
+        )
+
+        # Set a bunch of indices to -1 to simulate pruning.
+        pruned_indices = indices.clone().detach()
+        prune_select = torch.arange(pruned_indices.numel()) % 8 == 0
+        pruned_indices[prune_select] = -1
+
+        # Pre-prune per_sample_weights for reference
+        pruned_per_sample_weights = per_sample_weights.clone().detach()
+        pruned_per_sample_weights[prune_select] = 0.0
+
+        # Run reference
+        output_ref = op_ref(
+            indices=indices,
+            offsets=offsets,
+            per_sample_weights=pruned_per_sample_weights,
+        )
+
+        # Initialize shared memory to NaNs.
+        torch.ops.fbgemm.initialize_nan_shared_mem(torch.cuda.current_device())
+
+        # Run test
+        output = op(
+            indices=pruned_indices,
+            offsets=offsets,
+            per_sample_weights=per_sample_weights,
+        )
+
+        # Expect the outputs to be bit-wise equivalent
+        assert torch.equal(output_ref, output)
+
     @unittest.skipIf(*gpu_unavailable)
     @given(
         T=st.integers(min_value=1, max_value=10),
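As a sanity check on the expected values in this test: with every embedding row filled with 1.0, each element of the weighted-pooled output is just the sum of the surviving per-sample weights. The weights are arange(64), which sums to 2016; pruning every 8th index removes 0 + 8 + ... + 56 = 224, so each of the D = 160 output elements should come out to 1792 in both the reference run and the NaN-seeded inference run. Any uninitialized NaN weight would surface immediately in the bit-wise comparison.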
