
Commit e860a2c

faran928 authored and facebook-github-bot committed
Enable Ebc Heterogenous Sharding (#2837)
Summary:
Pull Request resolved: #2837

Enable Ebc Heterogenous Sharding so that a single Ebc table can be sharded across hbm and cpu.

Reviewed By: jiayisuse

Differential Revision: D70229136

fbshipit-source-id: baf190c311df95df2c17abe0d58b86d615dd4c56
1 parent 44d04b5 commit e860a2c

File tree

5 files changed: +92 -18 lines changed
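For context on what the diffs below enable: a table counts as heterogeneously sharded when its shards are placed on different device types. The following is a minimal, self-contained sketch (hypothetical two-shard row-wise layout, not code from this commit) of the per-shard device-type extraction that the updated get_device_from_parameter_sharding helpers perform:

from typing import Tuple, Union

from torch.distributed._shard.sharding_spec import EnumerableShardingSpec, ShardMetadata

# Hypothetical row-wise layout: first half of the table in GPU HBM, second half on host CPU.
spec = EnumerableShardingSpec(
    shards=[
        ShardMetadata(shard_offsets=[0, 0], shard_sizes=[512, 128], placement="rank:0/cuda:0"),
        ShardMetadata(shard_offsets=[512, 0], shard_sizes=[512, 128], placement="rank:1/cpu"),
    ]
)


def device_types_per_shard(spec: EnumerableShardingSpec) -> Union[str, Tuple[str, ...]]:
    # Collect one device type per shard; collapse to a single string when they all match.
    per_shard = tuple(shard.placement.device().type for shard in spec.shards)
    return per_shard[0] if len(set(per_shard)) == 1 else per_shard


print(device_types_per_shard(spec))  # ('cuda', 'cpu') -> heterogeneous table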

torchrec/distributed/embedding_lookup.py

+13 -1

@@ -822,25 +822,30 @@ def __init__(
         device: Optional[torch.device] = None,
         feature_processor: Optional[BaseGroupedFeatureProcessor] = None,
         fused_params: Optional[Dict[str, Any]] = None,
+        shard_index: Optional[int] = None,
     ) -> None:
         # TODO rename to _create_embedding_kernel
         def _create_lookup(
             config: GroupedEmbeddingConfig,
             device: Optional[torch.device] = None,
             fused_params: Optional[Dict[str, Any]] = None,
+            shard_index: Optional[int] = None,
         ) -> BaseBatchedEmbeddingBag[
             Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]
         ]:
             return QuantBatchedEmbeddingBag(
                 config=config,
                 device=device,
                 fused_params=fused_params,
+                shard_index=shard_index,
             )

         super().__init__()
         self._emb_modules: nn.ModuleList = nn.ModuleList()
         for config in grouped_configs:
-            self._emb_modules.append(_create_lookup(config, device, fused_params))
+            self._emb_modules.append(
+                _create_lookup(config, device, fused_params, shard_index)
+            )

         self._feature_splits: List[int] = [
             config.num_features() for config in grouped_configs
@@ -1030,6 +1035,7 @@ def __init__(
         world_size: int,
         fused_params: Optional[Dict[str, Any]] = None,
         device: Optional[torch.device] = None,
+        device_type_from_sharding_infos: Optional[Union[str, Tuple[str, ...]]] = None,
     ) -> None:
         super().__init__()
         self._embedding_lookups_per_rank: List[
@@ -1047,6 +1053,11 @@ def __init__(
         self._is_empty_rank: List[bool] = []
         for rank in range(world_size):
             empty_rank = len(grouped_configs_per_rank[rank]) == 0
+            # Propagate shard index to get the correct runtime_device based on shard metadata
+            # in case of heterogeneous sharding of a single table across different device types
+            shard_index = (
+                rank if isinstance(device_type_from_sharding_infos, tuple) else None
+            )
             self._is_empty_rank.append(empty_rank)
             if not empty_rank:
                 self._embedding_lookups_per_rank.append(
@@ -1055,6 +1066,7 @@ def __init__(
                         grouped_configs=grouped_configs_per_rank[rank],
                         device=rank_device(device_type, rank),
                         fused_params=fused_params,
+                        shard_index=shard_index,
                     )
                 )
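The shard_index propagated above is consumed by _get_runtime_device (see the quant_embedding_kernel.py diff below). A rough sketch of the intended effect, using a hypothetical pick_runtime_device_type helper rather than the real _get_runtime_device internals:

from typing import Optional, Tuple, Union

import torch


def pick_runtime_device_type(
    device_type_from_sharding_infos: Union[str, Tuple[str, ...]],
    shard_index: Optional[int],
) -> torch.device:
    # Hypothetical: with heterogeneous sharding each rank owns one shard, so its runtime
    # device is whatever device type the shard metadata records for that shard.
    if isinstance(device_type_from_sharding_infos, tuple):
        assert shard_index is not None, "heterogeneous table requires a shard index"
        return torch.device(device_type_from_sharding_infos[shard_index])
    return torch.device(device_type_from_sharding_infos)


# Rank 0 hosts the HBM shard, rank 1 the CPU shard of the same table.
print(pick_runtime_device_type(("cuda", "cpu"), shard_index=0))  # cuda
print(pick_runtime_device_type(("cuda", "cpu"), shard_index=1))  # cpu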

torchrec/distributed/embeddingbag.py

+23 -5

@@ -113,9 +113,27 @@ def _pin_and_move(tensor: torch.Tensor, device: torch.device) -> torch.Tensor:
     )


-def get_device_from_parameter_sharding(ps: ParameterSharding) -> str:
-    # pyre-ignore
-    return ps.sharding_spec.shards[0].placement.device().type
+def get_device_from_parameter_sharding(
+    ps: ParameterSharding,
+) -> Union[str, Tuple[str, ...]]:
+    """
+    Returns list of device type per shard if table is sharded across different
+    device types, else returns single device type for the table parameter
+    """
+    if not isinstance(ps.sharding_spec, EnumerableShardingSpec):
+        raise ValueError("Expected EnumerableShardingSpec as input to the function")
+
+    device_type_list: Tuple[str, ...] = tuple(
+        # pyre-fixme[16]: `Optional` has no attribute `device`
+        [shard.placement.device().type for shard in ps.sharding_spec.shards]
+    )
+    if len(set(device_type_list)) == 1:
+        return device_type_list[0]
+    else:
+        assert (
+            ps.sharding_type == "row_wise"
+        ), "Only row_wise sharding supports sharding across multiple device types for a table"
+        return device_type_list


 def replace_placement_with_meta_device(
@@ -319,7 +337,7 @@ def create_sharding_infos_by_sharding_device_group(
     prefix: str,
     fused_params: Optional[Dict[str, Any]],
     suffix: Optional[str] = "weight",
-) -> Dict[Tuple[str, str], List[EmbeddingShardingInfo]]:
+) -> Dict[Tuple[str, Union[str, Tuple[str, ...]]], List[EmbeddingShardingInfo]]:

     if fused_params is None:
         fused_params = {}
@@ -335,7 +353,7 @@ def create_sharding_infos_by_sharding_device_group(
             shared_feature[feature_name] = True

     sharding_type_device_group_to_sharding_infos: Dict[
-        Tuple[str, str], List[EmbeddingShardingInfo]
+        Tuple[str, Union[str, Tuple[str, ...]]], List[EmbeddingShardingInfo]
     ] = {}

     # state_dict returns parameter.Tensor, which loses parameter level attributes
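With get_device_from_parameter_sharding now returning either a string or a tuple, the grouping keys produced by create_sharding_infos_by_sharding_device_group can carry a per-shard device-type tuple. An illustrative (hypothetical) pair of keys:

from typing import Tuple, Union

DeviceGroup = Union[str, Tuple[str, ...]]

# All shards on one device type -> the key's device group stays a plain string.
homogeneous_key: Tuple[str, DeviceGroup] = ("table_wise", "cuda")
# Row-wise table split across HBM and CPU -> the device group is a tuple, one entry per shard.
heterogeneous_key: Tuple[str, DeviceGroup] = ("row_wise", ("cuda", "cpu"))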

torchrec/distributed/quant_embedding_kernel.py

+4 -1

@@ -232,6 +232,7 @@ def __init__(
         pg: Optional[dist.ProcessGroup] = None,
         device: Optional[torch.device] = None,
         fused_params: Optional[Dict[str, Any]] = None,
+        shard_index: Optional[int] = None,
     ) -> None:
         super().__init__(config, pg, device)

@@ -253,7 +254,9 @@ def __init__(
             fused_params
         )

-        self._runtime_device: torch.device = _get_runtime_device(device, config)
+        self._runtime_device: torch.device = _get_runtime_device(
+            device, config, shard_index
+        )
         # 16 for CUDA, 1 for others like CPU and MTIA.
         self._tbe_row_alignment: int = 16 if self._runtime_device.type == "cuda" else 1
         embedding_specs = []
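The shard-aware runtime device matters because it drives the TBE row alignment chosen right after it. A quick sketch of the consequence for the hypothetical HBM+CPU table (alignment values taken from the comment in the diff above):

import torch

# With shard_index propagated, the rank holding the HBM shard and the rank holding the
# CPU shard of the same table resolve different runtime devices, hence different alignments.
for runtime_device in (torch.device("cuda"), torch.device("cpu")):
    tbe_row_alignment = 16 if runtime_device.type == "cuda" else 1
    print(runtime_device.type, tbe_row_alignment)  # cuda 16, then cpu 1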

torchrec/distributed/quant_embeddingbag.py

+51 -11

@@ -15,6 +15,8 @@
     IntNBitTableBatchedEmbeddingBagsCodegen,
 )
 from torch import nn
+
+from torch.distributed._shard.sharding_spec import EnumerableShardingSpec
 from torchrec.distributed.embedding_lookup import EmbeddingComputeKernel
 from torchrec.distributed.embedding_sharding import (
     EmbeddingSharding,
@@ -68,14 +70,33 @@
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor, KeyedTensor


-def get_device_from_parameter_sharding(ps: ParameterSharding) -> str:
-    # pyre-ignore
-    return ps.sharding_spec.shards[0].placement.device().type
+def get_device_from_parameter_sharding(
+    ps: ParameterSharding,
+) -> Union[str, Tuple[str, ...]]:
+    """
+    Returns list of device type per shard if table is sharded across
+    different device types, else returns single device type for the
+    table parameter.
+    """
+    if not isinstance(ps.sharding_spec, EnumerableShardingSpec):
+        raise ValueError("Expected EnumerableShardingSpec as input to the function")
+
+    device_type_list: Tuple[str, ...] = tuple(
+        # pyre-fixme[16]: `Optional` has no attribute `device`
+        [shard.placement.device().type for shard in ps.sharding_spec.shards]
+    )
+    if len(set(device_type_list)) == 1:
+        return device_type_list[0]
+    else:
+        assert (
+            ps.sharding_type == "row_wise"
+        ), "Only row_wise sharding supports sharding across multiple device types for a table"
+        return device_type_list


 def get_device_from_sharding_infos(
     emb_shard_infos: List[EmbeddingShardingInfo],
-) -> str:
+) -> Union[str, Tuple[str, ...]]:
     res = list(
         {
             get_device_from_parameter_sharding(ps.param_sharding)
@@ -86,6 +107,13 @@ def get_device_from_sharding_infos(
     return res[0]


+def get_device_for_first_shard_from_sharding_infos(
+    emb_shard_infos: List[EmbeddingShardingInfo],
+) -> str:
+    device_type = get_device_from_sharding_infos(emb_shard_infos)
+    return device_type[0] if isinstance(device_type, tuple) else device_type
+
+
 torch.fx.wrap("len")


@@ -103,13 +131,19 @@ def create_infer_embedding_bag_sharding(
     NullShardingContext, InputDistOutputs, List[torch.Tensor], torch.Tensor
 ]:
     propogate_device: bool = get_propogate_device()
+    device_type_from_sharding_infos: Union[str, Tuple[str, ...]] = (
+        get_device_from_sharding_infos(sharding_infos)
+    )
     if sharding_type == ShardingType.TABLE_WISE.value:
         return InferTwEmbeddingSharding(
             sharding_infos, env, device=device if propogate_device else None
         )
     elif sharding_type == ShardingType.ROW_WISE.value:
         return InferRwPooledEmbeddingSharding(
-            sharding_infos, env, device=device if propogate_device else None
+            sharding_infos,
+            env,
+            device=device if propogate_device else None,
+            device_type_from_sharding_infos=device_type_from_sharding_infos,
         )
     elif sharding_type == ShardingType.COLUMN_WISE.value:
         return InferCwPooledEmbeddingSharding(
@@ -148,12 +182,12 @@ def __init__(
             module.embedding_bag_configs()
         )
         self._sharding_type_device_group_to_sharding_infos: Dict[
-            Tuple[str, str], List[EmbeddingShardingInfo]
+            Tuple[str, Union[str, Tuple[str, ...]]], List[EmbeddingShardingInfo]
         ] = create_sharding_infos_by_sharding_device_group(
            module, table_name_to_parameter_sharding, "embedding_bags.", fused_params
         )
         self._sharding_type_device_group_to_sharding: Dict[
-            Tuple[str, str],
+            Tuple[str, Union[str, Tuple[str, ...]]],
             EmbeddingSharding[
                 NullShardingContext,
                 InputDistOutputs,
@@ -167,7 +201,11 @@ def __init__(
                 (
                     env
                     if not isinstance(env, Dict)
-                    else env[get_device_from_sharding_infos(embedding_configs)]
+                    else env[
+                        get_device_for_first_shard_from_sharding_infos(
+                            embedding_configs
+                        )
+                    ]
                 ),
                 device if get_propogate_device() else None,
             )
@@ -250,7 +288,7 @@ def tbes_configs(

     def sharding_type_device_group_to_sharding_infos(
         self,
-    ) -> Dict[Tuple[str, str], List[EmbeddingShardingInfo]]:
+    ) -> Dict[Tuple[str, Union[str, Tuple[str, ...]]], List[EmbeddingShardingInfo]]:
         return self._sharding_type_device_group_to_sharding_infos

     def embedding_bag_configs(self) -> List[EmbeddingBagConfig]:
@@ -329,7 +367,9 @@ def copy(self, device: torch.device) -> nn.Module:
         return super().copy(device)

     @property
-    def shardings(self) -> Dict[Tuple[str, str], FeatureShardingMixIn]:
+    def shardings(
+        self,
+    ) -> Dict[Tuple[str, Union[str, Tuple[str, ...]]], FeatureShardingMixIn]:
         # pyre-ignore [7]
         return self._sharding_type_device_group_to_sharding

@@ -552,7 +592,7 @@ class ShardedQuantEbcInputDist(torch.nn.Module):
     def __init__(
         self,
         sharding_type_device_group_to_sharding: Dict[
-            Tuple[str, str],
+            Tuple[str, Union[str, Tuple[str, ...]]],
             EmbeddingSharding[
                 NullShardingContext,
                 InputDistOutputs,
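When env is passed as a dict keyed by device type, the sharder above now selects the entry for the first shard's device type via get_device_for_first_shard_from_sharding_infos. A minimal sketch of that selection with a hypothetical env mapping:

from typing import Dict, Tuple, Union


def first_shard_device_type(device_type: Union[str, Tuple[str, ...]]) -> str:
    # Same selection rule as get_device_for_first_shard_from_sharding_infos above.
    return device_type[0] if isinstance(device_type, tuple) else device_type


# Hypothetical stand-in for a Dict[str, ShardingEnv]; values are just labels here.
env_by_device: Dict[str, str] = {"cuda": "cuda_sharding_env", "cpu": "cpu_sharding_env"}
print(env_by_device[first_shard_device_type(("cuda", "cpu"))])  # cuda_sharding_env
print(env_by_device[first_shard_device_type("cpu")])            # cpu_sharding_env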

torchrec/distributed/sharding/rw_sharding.py

+1

@@ -752,6 +752,7 @@ def create_lookup(
             world_size=self._world_size,
             fused_params=fused_params,
             device=device if device is not None else self._device,
+            device_type_from_sharding_infos=self._device_type_from_sharding_infos,
         )

     def create_output_dist(
