Commit 5d4d905

[Distributed] Add send and recv helpers (vllm-project#5719)

1 parent 6c916ac commit 5d4d905

6 files changed: +278 -24 lines changed

tests/distributed/test_comm_ops.py (+74 -4)

```diff
@@ -8,12 +8,11 @@
 import ray
 import torch
 
-from vllm.distributed import (broadcast_tensor_dict,
+from vllm.distributed import (broadcast_tensor_dict, get_pp_group,
                               tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce)
 
-from ..utils import (init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+from ..utils import init_test_distributed_environment, multi_process_parallel
 
 
 @ray.remote(num_gpus=1, max_calls=1)
@@ -105,6 +104,68 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     assert torch.allclose(recv_dict["f"], test_dict["f"])
 
 
+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
+                                      distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    test_dict = {
+        # device tensor
+        "a": torch.arange(8, dtype=torch.float32, device="cuda"),
+        # CPU tensor
+        "b": torch.arange(16, dtype=torch.int8, device="cpu"),
+        "c": "test",
+        "d": [1, 2, 3],
+        "e": {
+            "a": 1,
+            "b": 2
+        },
+        # empty tensor
+        "f": torch.tensor([], dtype=torch.float32, device="cuda"),
+    }
+
+    if not get_pp_group().is_first_rank:
+        recv_dict = get_pp_group().recv_tensor_dict()
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send_tensor_dict(test_dict)
+
+    if not get_pp_group().is_first_rank:
+        assert len(recv_dict) == len(test_dict)
+        assert torch.allclose(recv_dict["a"], test_dict["a"])
+        assert torch.allclose(recv_dict["b"], test_dict["b"])
+        assert recv_dict["c"] == test_dict["c"]
+        assert recv_dict["d"] == test_dict["d"]
+        assert recv_dict["e"] == test_dict["e"]
+        assert torch.allclose(recv_dict["f"], test_dict["f"])
+
+
+@ray.remote(num_gpus=1, max_calls=1)
+def send_recv_test_worker(tp_size: int, pp_size: int, rank: int,
+                          distributed_init_port: str):
+    del os.environ["CUDA_VISIBLE_DEVICES"]
+    device = torch.device(f"cuda:{rank}")
+    torch.cuda.set_device(device)
+    init_test_distributed_environment(tp_size, pp_size, rank,
+                                      distributed_init_port)
+
+    size = 64
+    test_tensor = torch.arange(64, dtype=torch.float32, device="cuda")
+
+    if not get_pp_group().is_first_rank:
+        recv_tensor = get_pp_group().recv(size, dtype=torch.float32)
+
+    if not get_pp_group().is_last_rank:
+        get_pp_group().send(test_tensor)
+
+    if not get_pp_group().is_first_rank:
+        assert torch.allclose(test_tensor, recv_tensor)
+
+
 @pytest.mark.skipif(torch.cuda.device_count() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("tp_size", [2])
@@ -113,4 +174,13 @@ def broadcast_tensor_dict_test_worker(tp_size: int, pp_size: int, rank: int,
     broadcast_tensor_dict_test_worker
 ])
 def test_multi_process_tensor_parallel(tp_size, test_target):
-    multi_process_tensor_parallel(tp_size, 1, test_target)
+    multi_process_parallel(tp_size, 1, test_target)
+
+
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("pp_size", [2])
+@pytest.mark.parametrize(
+    "test_target", [send_recv_test_worker, send_recv_tensor_dict_test_worker])
+def test_multi_process_pipeline_parallel(pp_size, test_target):
+    multi_process_parallel(1, pp_size, test_target)
```
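The two new workers exercise the point-to-point helpers this commit adds to the pipeline-parallel group: `send`/`recv` for a single tensor and `send_tensor_dict`/`recv_tensor_dict` for a dict mixing GPU tensors, CPU tensors, and plain Python objects. As a rough usage sketch (not code from this commit), a pipeline stage could pass a fixed-size activation tensor downstream the same way the test does; the helper names and the `recv(size, dtype)` calling convention are taken from the test above, everything else is illustrative.

```python
# Illustrative sketch only (not part of this commit): move a fixed-size
# 1-D tensor from each pipeline stage to the next, mirroring the pattern
# in send_recv_test_worker above.
import torch

from vllm.distributed import get_pp_group

HIDDEN_SIZE = 64  # sender and receiver must agree on size and dtype


def forward_one_step() -> torch.Tensor:
    pp_group = get_pp_group()

    if pp_group.is_first_rank:
        # The first stage creates the tensor locally.
        hidden = torch.arange(HIDDEN_SIZE, dtype=torch.float32, device="cuda")
    else:
        # Later stages receive it from the previous pipeline rank;
        # recv(size, dtype) allocates and returns the received tensor.
        hidden = pp_group.recv(HIDDEN_SIZE, dtype=torch.float32)

    if not pp_group.is_last_rank:
        # Forward the tensor to the next pipeline rank.
        pp_group.send(hidden)

    return hidden
```

The same first-rank/last-rank guards apply to `send_tensor_dict`/`recv_tensor_dict`, which is how `send_recv_tensor_dict_test_worker` ships metadata alongside tensors.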

tests/distributed/test_custom_all_reduce.py (+2 -3)

```diff
@@ -12,8 +12,7 @@
                               get_tp_group, graph_capture)
 
 from ..utils import (ensure_model_parallel_initialized,
-                     init_test_distributed_environment,
-                     multi_process_tensor_parallel)
+                     init_test_distributed_environment, multi_process_parallel)
 
 random.seed(42)
 test_sizes = [random.randint(1024, 2048 * 1024) for _ in range(8)]
@@ -113,4 +112,4 @@ def test_custom_allreduce(tp_size, pipeline_parallel_size, test_target):
     world_size = tp_size * pipeline_parallel_size
     if world_size > torch.cuda.device_count():
         pytest.skip("Not enough GPUs to run the test.")
-    multi_process_tensor_parallel(tp_size, pipeline_parallel_size, test_target)
+    multi_process_parallel(tp_size, pipeline_parallel_size, test_target)
```

tests/distributed/test_pynccl.py (+12 -4)

```diff
@@ -168,9 +168,13 @@ def send_recv_worker_fn():
                         dtype=torch.float32).cuda(pynccl_comm.rank)
     with pynccl_comm.change_state(enable=True):
         if pynccl_comm.rank == 0:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
         else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
     result = tensor.mean().cpu().item()
     assert result == 1
 
@@ -203,9 +207,13 @@ def multiple_send_recv_worker_fn():
                         device=device)
     with pynccl_comm.change_state(enable=True):
         if torch.distributed.get_rank() in [0, 1]:
-            pynccl_comm.send(tensor)
+            pynccl_comm.send(tensor,
+                             dst=(pynccl_comm.rank + 1) %
+                             pynccl_comm.world_size)
         else:
-            pynccl_comm.recv(tensor)
+            pynccl_comm.recv(tensor,
+                             src=(pynccl_comm.rank - 1) %
+                             pynccl_comm.world_size)
     result = tensor.mean().cpu().item()
     if torch.distributed.get_rank() in [0, 2]:
         assert result == 1
```
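Both updated workers now name their peers explicitly with ring arithmetic: the next rank is `(rank + 1) % world_size` and the previous rank is `(rank - 1) % world_size`. A minimal standalone illustration of that arithmetic (not part of the commit):

```python
# Python's % always yields a non-negative result, so rank 0 wraps around to
# the last rank when looking up its predecessor in the ring.
def ring_neighbors(rank: int, world_size: int):
    prev_rank = (rank - 1) % world_size
    next_rank = (rank + 1) % world_size
    return prev_rank, next_rank


assert ring_neighbors(0, 4) == (3, 1)  # first rank receives from the last
assert ring_neighbors(3, 4) == (2, 0)  # last rank sends on to the first
```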

tests/utils.py (+1 -1)

```diff
@@ -129,7 +129,7 @@ def init_test_distributed_environment(
     ensure_model_parallel_initialized(tp_size, pp_size)
 
 
-def multi_process_tensor_parallel(
+def multi_process_parallel(
     tp_size: int,
     pp_size: int,
     test_target,
```

vllm/distributed/device_communicators/pynccl.py (+2 -12)

```diff
@@ -121,36 +121,26 @@ def all_reduce(self,
                                 ncclRedOpTypeEnum.from_torch(op), self.comm,
                                 cudaStream_t(stream.cuda_stream))
 
-    def send(self,
-             tensor: torch.Tensor,
-             dst: Optional[int] = None,
-             stream=None):
+    def send(self, tensor: torch.Tensor, dst: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
             stream = self.stream
-        if dst is None:
-            dst = (self.rank + 1) % self.world_size
         self.nccl.ncclSend(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), dst,
                            self.comm, cudaStream_t(stream.cuda_stream))
 
-    def recv(self,
-             tensor: torch.Tensor,
-             src: Optional[int] = None,
-             stream=None):
+    def recv(self, tensor: torch.Tensor, src: int, stream=None):
         if self.disabled:
             return
         assert tensor.device == self.device, (
             f"this nccl communicator is created to work on {self.device}, "
             f"but the input tensor is on {tensor.device}")
         if stream is None:
             stream = self.stream
-        if src is None:
-            src = (self.rank - 1) % self.world_size
         self.nccl.ncclRecv(buffer_type(tensor.data_ptr()), tensor.numel(),
                            ncclDataTypeEnum.from_torch(tensor.dtype), src,
                            self.comm, cudaStream_t(stream.cuda_stream))
```
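With the default-peer fallback removed, `send` and `recv` now require the caller to pass `dst`/`src` explicitly, so choosing the peer becomes the responsibility of higher-level code such as the group helpers exercised in the tests. The snippet below is a hypothetical wrapper, not the actual vLLM group-coordinator code; it only assumes the communicator exposes `rank`, `world_size`, and the `send(tensor, dst)`/`recv(tensor, src)` signatures shown in this diff.

```python
# Hypothetical wrapper (for illustration only): computes the ring peers once
# and feeds them to the explicit-peer send/recv methods from this commit.
import torch


class PipelineLink:

    def __init__(self, comm):
        # `comm` is assumed to behave like PyNcclCommunicator after this
        # change: it has .rank, .world_size, .send(tensor, dst=...),
        # and .recv(tensor, src=...).
        self.comm = comm
        self.next_rank = (comm.rank + 1) % comm.world_size
        self.prev_rank = (comm.rank - 1) % comm.world_size

    def send_to_next(self, tensor: torch.Tensor) -> None:
        # Point-to-point send to the next pipeline stage.
        self.comm.send(tensor, dst=self.next_rank)

    def recv_from_prev(self, tensor: torch.Tensor) -> None:
        # Receive into a pre-allocated tensor from the previous stage.
        self.comm.recv(tensor, src=self.prev_rank)
```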
