
Commit a425493 (parent: d076a54)

fixes reduce_scatter function signature, refactors test and adds reduce_scatter test

2 files changed (+45, -6 lines)

torchft/process_group.py (+4, -5)
@@ -185,7 +185,7 @@ def broadcast_one(self, tensor: torch.Tensor, root: int) -> Work:
     def reduce_scatter(
         self,
         output_tensors: List[torch.Tensor],
-        input_tensors: List[torch.Tensor],
+        input_tensors: List[List[torch.Tensor]],
         opts: ReduceScatterOptions,
     ) -> Work:
         """
@@ -306,7 +306,7 @@ def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> Work:
     def reduce_scatter(
         self,
         output_tensors: List[torch.Tensor],
-        input_tensors: List[torch.Tensor],
+        input_tensors: List[List[torch.Tensor]],
         opts: object,
     ) -> Work:
         return self.parent.reduce_scatter(output_tensors, input_tensors, opts)
@@ -424,10 +424,10 @@ def broadcast(self, tensor_list: List[torch.Tensor], opts: object) -> Work:
     def reduce_scatter(
         self,
         output_tensors: List[torch.Tensor],
-        input_tensors: List[torch.Tensor],
+        input_tensors: List[List[torch.Tensor]],
         opts: object,
     ) -> Work:
-        for o, i in zip(output_tensors, input_tensors):
+        for o, i in zip(output_tensors, input_tensors[0]):
             o.copy_(i)

         res = _DummyWork(output_tensors)
@@ -1013,7 +1013,6 @@ def reduce_scatter(
         for tensor in tensor_list:
             if not tensor.is_shared():
                 tensor.share_memory_()
-
         return self._run_func("reduce_scatter", output_tensors, input_tensors, opts)

     def size(self) -> int:

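For context, the corrected signature matches how the diff itself consumes the argument (the dummy implementation reads input_tensors[0]): input_tensors is a list with one inner list per output tensor, and that inner list holds one tensor per rank. The sketch below is illustrative only and not part of the commit; it simulates the reduction locally with plain tensors, using a made-up world size and variable names, to show what the List[List[torch.Tensor]] layout means.

# Illustrative sketch (not from the commit): the reduce_scatter input layout,
# simulated locally with an assumed world_size of 2 and sum reduction.
import torch

world_size = 2

# Inputs as each rank would pass them: one inner list per output tensor
# (usually one); inputs[r][0][j] is rank r's contribution destined for rank j.
inputs = [
    [[torch.full((2, 3), float(r + 1)) for _ in range(world_size)]]
    for r in range(world_size)
]

# With a real backend, rank j's single output tensor would receive the
# elementwise sum of inputs[r][0][j] over all ranks r.
expected = [sum(inputs[r][0][j] for r in range(world_size)) for j in range(world_size)]
print(expected[0])  # every element is 1.0 + 2.0 = 3.0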
torchft/process_group_test.py (+41, -1)
@@ -61,6 +61,31 @@ def dummy_init_pg() -> None:
     )


+def _should_run_collective(collective_str: str, backend_str: str, device: str) -> bool:
+    """Verify if the collective is supported by the backend and device.
+
+    See https://pytorch.org/docs/stable/distributed.html#backends for the
+    supported collectives / backends / devices matrix.
+
+    """
+    if "nccl" in backend_str.lower():
+        # all collectives are supported for NCCL/CUDA but none on CPU.
+        return device == "cuda"
+    elif "gloo" in backend_str.lower():
+        if device == "cuda":
+            # GLOO/GPU only supports broadcast and all_reduce.
+            if collective_str in ["broadcast", "all_reduce"]:
+                return True
+            return False
+        else:  # cpu
+            if collective_str in ["reduce_scatter", "all_to_all"]:
+                return False
+            return True
+    else:
+        # Non defined backends (e.g. ErrorSwallowing) should continue to work.
+        return True
+
+
 def _test_pg(
     pg: ProcessGroup,
     example_tensor: torch.Tensor = torch.randn((2, 3), dtype=torch.float32),
@@ -95,10 +120,25 @@ def check_tensors(arg: Any) -> None:  # pyre-ignore[2]
         ("allgather", (output_tensors, [input_tensor], AllgatherOptions())),
         ("broadcast", (tensor_list, BroadcastOptions())),
         ("broadcast_one", (input_tensor, 0)),
-        ("reduce_scatter", (output_tensors, [input_tensor], ReduceScatterOptions())),
+        (
+            "reduce_scatter",
+            (output_tensors[0], [[input_tensor]], ReduceScatterOptions()),
+        ),
     ]
     works: Dict[str, dist._Work] = {}
+
+    try:
+        backend_str = pg.getBackendName()
+        device = example_tensor.device
+        if type(device) is torch.device:
+            device = device.type
+    except NotImplementedError as e:
+        backend_str = ""
+        device = ""
+
     for coll_str, args in collectives:
+        if not _should_run_collective(coll_str, backend_str=backend_str, device=device):
+            continue
         coll = getattr(pg, coll_str)
         work = coll(*args)
         works[coll_str] = work

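As a quick illustration of the gating logic above (not part of the commit), these checks spell out the helper's expected answers for a few backend/device combinations, assuming _should_run_collective from the test file is in scope:

# Sanity checks for the helper above (illustrative only; assumes
# _should_run_collective is imported or copied from the test file).
assert _should_run_collective("reduce_scatter", "nccl", "cuda")      # NCCL + CUDA: supported
assert not _should_run_collective("reduce_scatter", "nccl", "cpu")   # NCCL has no CPU collectives
assert _should_run_collective("all_reduce", "gloo", "cuda")          # Gloo on GPU: broadcast/all_reduce only
assert not _should_run_collective("reduce_scatter", "gloo", "cpu")   # Gloo on CPU lacks reduce_scatter
assert _should_run_collective("allgather", "gloo", "cpu")            # Gloo on CPU supports allgather
assert _should_run_collective("broadcast", "", "cpu")                # unknown backend: run everything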