
Commit 5014024

TroyGarden authored and facebook-github-bot committed on Mar 7, 2025
reland D70126859
Summary:

# context

* previous diff triggered S495021
* the error message is like

  ```
  ModelGenerationPlatformError("AttributeError: '_EmbeddingBagProxy' object has no attribute 'weight'")
  ```

* this diff works around accessing `embedding_bag.weight` for the weight's dtype; instead it uses the dtype from the table config (sketched below)

Differential Revision: D70712348
1 parent 592ed93 commit 5014024
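
Below is a minimal sketch of the workaround described in the summary, assuming TorchRec's public `EmbeddingBagConfig` and `DataType`; the helper `weights_dtype_from_config` and the toy config values are illustrative only and are not part of this commit. The idea is to derive the dtype for `per_sample_weights` from the table config rather than from `embedding_bag.weight.dtype`, which is unavailable when the bag has been replaced by an `_EmbeddingBagProxy` during tracing.

```python
# Sketch only: illustrates the dtype-from-config workaround; the helper and
# config values below are hypothetical, not part of the commit.
import torch
from torchrec.modules.embedding_configs import DataType, EmbeddingBagConfig


def weights_dtype_from_config(config: EmbeddingBagConfig) -> torch.dtype:
    # Use the table config's DataType instead of embedding_bag.weight.dtype,
    # which raises AttributeError on an _EmbeddingBagProxy during tracing.
    return torch.float32 if config.data_type == DataType.FP32 else torch.float16


# Hypothetical FP16 weighted table for illustration.
config = EmbeddingBagConfig(
    num_embeddings=16,
    embedding_dim=8,
    name="weighted_table_0",
    feature_names=["f_w"],
    data_type=DataType.FP16,
)
per_sample_weights = torch.rand(4)
per_sample_weights = per_sample_weights.to(weights_dtype_from_config(config))
print(per_sample_weights.dtype)  # torch.float16
```

The same mapping appears in the `embedding_modules.py` hunk below, where the computed dtype is applied to `f.weights()` before calling the embedding bag.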

File tree

3 files changed: +28 −7 lines changed

 

torchrec/distributed/test_utils/test_model.py

+1 −2

@@ -243,8 +243,7 @@ def _validate_pooling_factor(
             global_idlist_indices.append(indices)
             global_idlist_offsets.append(offsets)
 
-        for idx in range(len(idscore_ind_ranges)):
-            ind_range = idscore_ind_ranges[idx]
+        for idx, ind_range in enumerate(idscore_ind_ranges):
             lengths_ = torch.abs(
                 torch.randn(batch_size * world_size, device=device)
                 + (

torchrec/distributed/test_utils/test_sharding.py

+18 −4

@@ -59,7 +59,11 @@
     ShardingPlan,
     ShardingType,
 )
-from torchrec.modules.embedding_configs import BaseEmbeddingConfig, EmbeddingBagConfig
+from torchrec.modules.embedding_configs import (
+    BaseEmbeddingConfig,
+    DataType,
+    EmbeddingBagConfig,
+)
 from torchrec.optim.keyed import CombinedOptimizer, KeyedOptimizerWrapper
 from torchrec.optim.optimizers import in_backward_optimizer_filter
 
@@ -554,9 +558,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
         )
 
         # Compare predictions of sharded vs unsharded models.
-        if qcomms_config is None:
-            torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
-        else:
+        if qcomms_config is not None:
             # With quantized comms, we can relax constraints a bit
             rtol = 0.003
             if CommType.FP8 in [
@@ -568,6 +570,18 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
             torch.testing.assert_close(
                 global_pred, torch.cat(all_local_pred), rtol=rtol, atol=atol
             )
+        elif (
+            weighted_tables is not None
+            and weighted_tables[0].data_type == DataType.FP16
+        ):  # https://www.internalfb.com/intern/diffing/?paste_number=1740410921
+            torch.testing.assert_close(
+                global_pred,
+                torch.cat(all_local_pred),
+                atol=1e-4,  # relaxed atol due to FP16 in weights
+                rtol=1e-4,  # relaxed rtol due to FP16 in weights
+            )
+        else:
+            torch.testing.assert_close(global_pred, torch.cat(all_local_pred))
 
 
 def create_device_mesh_for_2D(

torchrec/modules/embedding_modules.py

+9 −1

@@ -243,12 +243,20 @@ def forward(
         pooled_embeddings: List[torch.Tensor] = []
         feature_dict = features.to_dict()
         for i, embedding_bag in enumerate(self.embedding_bags.values()):
+            embedding_config = self._embedding_bag_configs[i]
+            dtype = (
+                torch.float32
+                if embedding_config.data_type == DataType.FP32
+                else torch.float16
+            )
             for feature_name in self._feature_names[i]:
                 f = feature_dict[feature_name]
                 res = embedding_bag(
                     input=f.values(),
                     offsets=f.offsets(),
-                    per_sample_weights=f.weights() if self._is_weighted else None,
+                    per_sample_weights=(
+                        f.weights().to(dtype) if self._is_weighted else None
+                    ),
                 ).float()
                 pooled_embeddings.append(res)
         return KeyedTensor(

Comments (0)