Skip to content

Commit 2ac012e

Browse files
Ali-Tehrani authored
and facebook-github-bot committed
Add intra_group_size to topology (meta-pytorch#3696)
Summary: Context --------- Every GB200 node has 2 B200 GPUs attached to it; however, NVLink allows up to 72 B200 GPUs to be connected. The planner needs to know how big the intra-topology group size is going to be. This causes the `local_world_size` to be different from the `intra_group_size`. Implementation ------------------ - Topology class: - Adds `pod_size`, and uses it to calculate the `intra_group_size` (the maximum number of processes linked with high intra bandwidth) in the Topology class. If `pod_size` isn't given, it defaults to `local_world_size`. - `shard_estimators.py` - The shard estimators now use the `intra_group_size` instead of `local_world_size`; this allows RW/TW/CW to properly account for the larger NVLink domain that comes with the pods. Reviewed By: isururanawaka Differential Revision: D91617887
1 parent 06d0acb commit 2ac012e

File tree

4 files changed

+28
-5
lines changed

4 files changed

+28
-5
lines changed

torchrec/distributed/planner/enumerators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def __init__(
8888
) -> None:
8989
self._compute_device: str = topology.compute_device
9090
self._world_size: int = topology.world_size
91-
self._local_world_size: int = topology.local_world_size
91+
self._local_world_size: int = topology.intra_group_size
9292
self._batch_size: int = batch_size
9393
self._constraints = constraints
9494
self._sharder_map: Dict[str, ModuleSharder[nn.Module]] = {}

torchrec/distributed/planner/partitioners.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,7 @@ def partition(
302302
== PartitionByType.DEVICE.value
303303
):
304304
if minheap_devices is None:
305+
# Local world size should be used, since number of GPU per host/CPU
305306
minheap_devices = self._establish_minheap(
306307
devices, storage_constraint.local_world_size
307308
)
@@ -652,11 +653,11 @@ def _cohost_partition(
652653
def _get_host_level_devices(
653654
topology: Topology, all_devices: List[DeviceHardware]
654655
) -> List[List[DeviceHardware]]:
655-
num_hosts: int = topology.world_size // topology.local_world_size
656+
num_hosts: int = topology.world_size // topology.intra_group_size
656657
host_level_devices: List[List[DeviceHardware]] = []
657658
for i in range(num_hosts):
658659
devices_in_host = all_devices[
659-
i * topology.local_world_size : (i + 1) * topology.local_world_size
660+
i * topology.intra_group_size : (i + 1) * topology.intra_group_size
660661
]
661662
host_level_devices.append(devices_in_host)
662663
return host_level_devices

torchrec/distributed/planner/shard_estimators.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ def estimate(
234234
sharding_type=sharding_option.sharding_type,
235235
batch_sizes=batch_sizes,
236236
world_size=self._topology.world_size,
237-
local_world_size=self._topology.local_world_size,
237+
local_world_size=self._topology.intra_group_size,
238238
input_lengths=sharding_option.input_lengths,
239239
input_data_type_size=input_data_type_size,
240240
table_data_type_size=table_data_type_size,
@@ -1146,7 +1146,7 @@ def estimate(
11461146
shard_sizes=[shard.size for shard in sharding_option.shards],
11471147
batch_sizes=batch_sizes,
11481148
world_size=self._topology.world_size,
1149-
local_world_size=self._topology.local_world_size,
1149+
local_world_size=self._topology.intra_group_size,
11501150
input_lengths=sharding_option.input_lengths,
11511151
num_poolings=num_poolings,
11521152
caching_ratio=caching_ratio if caching_ratio else UVM_CACHING_RATIO,

torchrec/distributed/planner/types.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ def __init__(
286286
hbm_cap: Optional[int] = None,
287287
ddr_cap: Optional[int] = None,
288288
local_world_size: Optional[int] = None,
289+
pod_size: Optional[int] = None,
289290
hbm_mem_bw: float = HBM_MEM_BW,
290291
ddr_mem_bw: float = DDR_MEM_BW,
291292
hbm_to_ddr_mem_bw: float = HBM_TO_DDR_MEM_BW,
@@ -310,6 +311,10 @@ def __init__(
310311
"cuda",
311312
"mtia",
312313
], f"unsupported compute device {compute_device}"
314+
if pod_size and pod_size > world_size:
315+
raise ValueError(
316+
f"pod_size={pod_size} cannot be greater than world_size={world_size}"
317+
)
313318

314319
self._compute_device = compute_device
315320
self._world_size = world_size
@@ -343,9 +348,19 @@ def __init__(
343348
)
344349
)
345350

351+
# Local world size is the number of devices (GPUs) in a single node
346352
self._local_world_size: int = (
347353
local_world_size if local_world_size else world_size
348354
)
355+
self._pod_size: int = pod_size
356+
# Maximum numb of devices with high bandwidth interconnect (e.g. NVLink)
357+
# if pod_size isn't given, then assumes local_world_size is maximum group size
358+
self._intra_group_size: int = (
359+
pod_size * self._local_world_size
360+
if pod_size is not None
361+
else self._local_world_size
362+
)
363+
349364
self._hbm_mem_bw = hbm_mem_bw
350365
self._ddr_mem_bw = ddr_mem_bw
351366
self._hbm_to_ddr_mem_bw = hbm_to_ddr_mem_bw
@@ -381,6 +396,11 @@ def world_size(self) -> int:
381396
def local_world_size(self) -> int:
382397
return self._local_world_size
383398

399+
@property
400+
def intra_group_size(self) -> int:
401+
# The largest set of nodes connected with high intra-node bandwidth (e.g. NVLink)
402+
return self._intra_group_size
403+
384404
@property
385405
def hbm_mem_bw(self) -> float:
386406
return self._hbm_mem_bw
@@ -424,6 +444,7 @@ def __repr__(self) -> str:
424444
for idx, device in enumerate(self._devices):
425445
topology_repr += f"\tdevice {idx} {device}\n"
426446
topology_repr += f"local_world_size={self._local_world_size} \n"
447+
topology_repr += f"intra_group_size={self._intra_group_size} \n"
427448
topology_repr += str(self._comms_bandwidths) + "\n"
428449
return topology_repr
429450

@@ -449,6 +470,7 @@ def _hash(self) -> int:
449470
hbms,
450471
ddrs,
451472
self._local_world_size,
473+
self._intra_group_size,
452474
self._hbm_mem_bw,
453475
self._ddr_mem_bw,
454476
self._hbm_to_ddr_mem_bw,

0 commit comments

Comments
 (0)