|
19 | 19 | CppTransport, |
20 | 20 | CppTransportFlags, |
21 | 21 | ) |
22 | | -import mpi4py |
23 | 22 | import numpy as np |
| 23 | +import pickle |
24 | 24 |
|
25 | 25 | from mscclpp.utils import is_torch_tensor |
26 | 26 |
|
|
29 | 29 |
|
30 | 30 | class CommGroup: |
31 | 31 | def __init__( |
32 | | - self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None |
| 32 | + self, |
| 33 | + mpi_comm: "mpi4py.MPI.Comm" = None, |
| 34 | + torch_group: "dist.ProcessGroup" = None, |
| 35 | + interfaceIpPortTrio: str = "", |
| 36 | + rank: int = None, |
| 37 | + size: int = None, |
33 | 38 | ): |
34 | | - if interfaceIpPortTrio == "": |
35 | | - self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) |
| 39 | + if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None): |
36 | 40 | uniq_id = None |
37 | | - if mpi_comm.rank == 0: |
38 | | - # similar to NCCL's unique id |
| 41 | + self.bootstrap = CppTcpBootstrap.create(rank, size) |
| 42 | + if rank == 0: |
39 | 43 | uniq_id = self.bootstrap.create_unique_id() |
40 | | - uniq_id_global = mpi_comm.bcast(uniq_id, 0) |
| 44 | + if mpi_comm is not None: |
| 45 | + import mpi4py |
| 46 | + |
| 47 | + uniq_id_global = mpi_comm.bcast(uniq_id, 0) |
| 48 | + else: |
| 49 | + import torch |
| 50 | + import torch.distributed as dist |
| 51 | + |
| 52 | + if rank == 0: |
| 53 | + uniq_id_global = uniq_id |
| 54 | + pickled_data = pickle.dumps(uniq_id) |
| 55 | + data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone() |
| 56 | + else: |
| 57 | + data_tensor = torch.zeros(256, dtype=torch.uint8) |
| 58 | + dist.broadcast(data_tensor, src=0, group=torch_group) |
| 59 | + uniq_id_global = pickle.loads(data_tensor.numpy().tobytes()) |
41 | 60 | self.bootstrap.initialize(uniq_id_global) |
42 | | - elif mpi_comm: |
43 | | - # use this instead |
44 | | - self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size) |
45 | | - self.bootstrap.initialize(interfaceIpPortTrio) |
46 | 61 | elif not interfaceIpPortTrio == "": |
47 | 62 | assert rank >= 0 and size >= 1 |
48 | 63 | self.bootstrap = CppTcpBootstrap.create(rank, size) |
|
0 commit comments