Skip to content

Commit 53e909d

Browse files
authored
Merge branch 'main' into binyli/refactor
2 parents 09431c4 + 3962574 commit 53e909d

File tree

8 files changed: 34 additions (+34) and 19 deletions (-19).

.azure-pipelines/multi-nodes-test.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ jobs:
4444
targetType: 'inline'
4545
script: |
4646
mkdir build && cd build
47-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
47+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
4848
make -j
4949
workingDirectory: '$(System.DefaultWorkingDirectory)'
5050

.azure-pipelines/templates/integration-test.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ steps:
1919
targetType: inline
2020
script: |
2121
mkdir build && cd build
22-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
22+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
2323
make -j
2424
workingDirectory: '$(System.DefaultWorkingDirectory)'
2525

.azure-pipelines/templates/nccl-test.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ steps:
2727
targetType: 'inline'
2828
script: |
2929
mkdir build && cd build
30-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON ..
30+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON ..
3131
make -j
3232
workingDirectory: '$(System.DefaultWorkingDirectory)/mscclpp'
3333

.azure-pipelines/templates/ut-no-ib-env.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ steps:
1616
targetType: 'inline'
1717
script: |
1818
mkdir build && cd build
19-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
19+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
2020
make -j
2121
workingDirectory: '$(System.DefaultWorkingDirectory)'
2222

.azure-pipelines/templates/ut-npkit.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ steps:
6363
set -e; \
6464
cd /root/mscclpp; \
6565
mkdir -p build && cd build; \
66-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
66+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} -DMSCCLPP_NPKIT_FLAGS=\"-DENABLE_NPKIT -DENABLE_NPKIT_EVENT_TIME_SYNC_CPU -DENABLE_NPKIT_EVENT_TIME_SYNC_GPU -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_INIT_EXIT -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY -DENABLE_NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT\" ..; \
6767
make -j"'
6868
kill $CHILD_PID
6969
workingDirectory: '$(System.DefaultWorkingDirectory)'

.azure-pipelines/templates/ut.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,9 @@ steps:
2020
script: |
2121
mkdir build && cd build
2222
if [ "${{ parameters.platform }}" == "rocm" ]; then
23-
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
23+
CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_ROCM=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
2424
else
25-
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
25+
cmake -DCMAKE_BUILD_TYPE=Release -DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON -DMSCCLPP_BUILD_TESTS=ON -DMSCCLPP_GPU_ARCHS=${{ parameters.gpuArch }} ..
2626
fi
2727
make -j
2828
workingDirectory: '$(System.DefaultWorkingDirectory)'

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
4747

4848
# Options
4949
option(MSCCLPP_ENABLE_TRACE "Enable tracing" OFF)
50-
option(MSCCLPP_BUILD_TESTS "Build tests" ON)
50+
option(MSCCLPP_BUILD_TESTS "Build tests" OFF)
5151
option(MSCCLPP_BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
5252
option(MSCCLPP_BUILD_EXT_NCCL "Build NCCL interfaces" ON)
5353
option(MSCCLPP_BUILD_EXT_COLLECTIVES "Build collective algorithms" ON)

python/mscclpp/_core/comm.py

Lines changed: 26 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,8 @@
1919
CppTransport,
2020
CppTransportFlags,
2121
)
22-
import mpi4py
2322
import numpy as np
23+
import pickle
2424

2525
from mscclpp.utils import is_torch_tensor
2626

@@ -29,20 +29,35 @@
2929

3030
class CommGroup:
3131
def __init__(
32-
self, mpi_comm: mpi4py.MPI.Comm = None, interfaceIpPortTrio: str = "", rank: int = None, size: int = None
32+
self,
33+
mpi_comm: "mpi4py.MPI.Comm" = None,
34+
torch_group: "dist.ProcessGroup" = None,
35+
interfaceIpPortTrio: str = "",
36+
rank: int = None,
37+
size: int = None,
3338
):
34-
if interfaceIpPortTrio == "":
35-
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
39+
if interfaceIpPortTrio == "" and (mpi_comm is not None or torch_group is not None):
3640
uniq_id = None
37-
if mpi_comm.rank == 0:
38-
# similar to NCCL's unique id
41+
self.bootstrap = CppTcpBootstrap.create(rank, size)
42+
if rank == 0:
3943
uniq_id = self.bootstrap.create_unique_id()
40-
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
44+
if mpi_comm is not None:
45+
import mpi4py
46+
47+
uniq_id_global = mpi_comm.bcast(uniq_id, 0)
48+
else:
49+
import torch
50+
import torch.distributed as dist
51+
52+
if rank == 0:
53+
uniq_id_global = uniq_id
54+
pickled_data = pickle.dumps(uniq_id)
55+
data_tensor = torch.frombuffer(bytearray(pickled_data), dtype=torch.uint8).clone()
56+
else:
57+
data_tensor = torch.zeros(256, dtype=torch.uint8)
58+
dist.broadcast(data_tensor, src=0, group=torch_group)
59+
uniq_id_global = pickle.loads(data_tensor.numpy().tobytes())
4160
self.bootstrap.initialize(uniq_id_global)
42-
elif mpi_comm:
43-
# use this instead
44-
self.bootstrap = CppTcpBootstrap.create(mpi_comm.rank, mpi_comm.size)
45-
self.bootstrap.initialize(interfaceIpPortTrio)
4661
elif not interfaceIpPortTrio == "":
4762
assert rank >= 0 and size >= 1
4863
self.bootstrap = CppTcpBootstrap.create(rank, size)

0 commit comments

Comments
 (0)