
Commit 1a7cb98

v0.4.3 (#279)
1 parent 5ba6ce0 commit 1a7cb98

13 files changed: +34 -16 lines

CITATION.cff (+4 -1)

@@ -1,6 +1,6 @@
 cff-version: 1.2.0
 title: "MSCCL++: A GPU-driven communication stack for scalable AI applications"
-version: 0.4.2
+version: 0.4.3
 message: >-
   If you use this project in your research, please cite it as below.
 authors:
@@ -31,6 +31,9 @@ authors:
 - given-names: Olli
   family-names: Saarikivi
   affiliation: Microsoft Research
+- given-names: Aashaka
+  family-names: Shah
+  affiliation: Microsoft Research
 - given-names: Wei
   family-names: Tsui
   affiliation: Microsoft Research

CMakeLists.txt (+1 -1)

@@ -3,7 +3,7 @@

 set(MSCCLPP_MAJOR "0")
 set(MSCCLPP_MINOR "4")
-set(MSCCLPP_PATCH "2")
+set(MSCCLPP_PATCH "3")

 set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
 set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")

docker/base-dev-x.dockerfile (+2 -2)

@@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
 ADD . /tmp/mscclpp
 WORKDIR /tmp/mscclpp
 ARG TARGET="cuda12.1"
-RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
-    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
+RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
+    python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

 # Set PATH
 RUN echo PATH="${PATH}" > /etc/environment
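
Not part of the commit: a quick sketch of what the new `target_type` derivation above resolves to. The loop and echo below are illustrative only; the CUDA targets are the keys listed in `docker/build.sh`.

```bash
# Illustrative only: strip the trailing ".<digits>" from TARGET, as the new RUN line does.
for TARGET in cuda11.8 cuda12.1 cuda12.3; do
  target_type=$(echo $TARGET | sed 's/\.[0-9]*$//')
  echo "${TARGET} -> python/requirements_${target_type}.txt"
done
# cuda11.8 -> python/requirements_cuda11.txt
# cuda12.1 -> python/requirements_cuda12.txt
# cuda12.3 -> python/requirements_cuda12.txt
```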

docker/build.sh (+3 -1)

@@ -7,20 +7,22 @@ baseImageTable=(
     ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
     ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
     ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
+    ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
 )

 declare -A extraLdPathTable
 extraLdPathTable=(
     ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
     ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
     ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
+    ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
 )

 GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
 TARGET=${1}

 print_usage() {
-    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3]"
 }

 if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
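
Judging only from this hunk (the script takes one of the `baseImageTable` keys as its first argument), a hypothetical invocation for the newly added target would be:

```bash
# Hypothetical usage; the argument must be a key of baseImageTable.
./docker/build.sh cuda12.3
```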

docs/conf.py (+1 -1)

@@ -9,7 +9,7 @@
 project = "mscclpp"
 copyright = "2023, MSCCL++ Team"
 author = "MSCCL++ Team"
-release = "v0.4.2"
+release = "v0.4.3"

 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

docs/quickstart.md (+8 -5)

@@ -11,11 +11,11 @@
 * NVIDIA A100 GPUs + CUDA >= 11.8
 * NVIDIA H100 GPUs + CUDA >= 12.0
 * AMD MI250X GPUs + ROCm >= 5.7
-* AMD MI300X GPUs + ROCm >= 5.7
+* AMD MI300X GPUs + ROCm >= 6.0
 * OS: tested over Ubuntu 18.04 and 20.04
 * Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
 * Others
-  * `nvidia_peermem` driver should be loaded on all nodes. Check it via:
+  * For NVIDIA platforms, `nvidia_peermem` driver should be loaded on all nodes. Check it via:
     ```
     lsmod | grep nvidia_peermem
     ```
@@ -59,15 +59,18 @@ $ sudo make install/fast
 Python 3.8 or later is required.

 ```bash
+# For NVIDIA platforms
 $ python -m pip install .
+# For AMD platforms
+$ CXX=/path/to/hipcc python -m pip install .
 ```

 ## Docker Images

 Our base image installs all prerequisites for MSCCL++.

 ```bash
-$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3
 ```

 See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
@@ -101,8 +104,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
 [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.

 ```bash
-# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
-$ python3 -m pip install -r ./python/requirements_cu12.txt
+# Choose `requirements_*.txt` according to your CUDA/ROCm version.
+$ python3 -m pip install -r ./python/requirements_cuda12.txt
 $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
 ```
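
As a usage sketch (not part of the quickstart diff), the updated base image can be pulled and started with GPU access; the `--gpus all` flag and interactive shell are illustrative assumptions:

```bash
# Illustrative only: pull the new base image and open a shell with GPUs attached.
docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3
docker run --rm --gpus all -it ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3 bash
```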

include/mscclpp/core.hpp (+1 -1)

@@ -6,7 +6,7 @@

 #define MSCCLPP_MAJOR 0
 #define MSCCLPP_MINOR 4
-#define MSCCLPP_PATCH 2
+#define MSCCLPP_PATCH 3
 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)

 #include <array>
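
The `MSCCLPP_VERSION` macro above packs the three components into one integer; a shell sketch of the arithmetic for this release:

```bash
# MSCCLPP_VERSION = MAJOR * 10000 + MINOR * 100 + PATCH
echo $((0 * 10000 + 4 * 100 + 3))   # prints 403 for v0.4.3
```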

include/mscclpp/gpu.hpp (+10)

@@ -6,6 +6,8 @@

 #if defined(__HIP_PLATFORM_AMD__)

+#include <hip/hip_bf16.h>
+#include <hip/hip_fp16.h>
 #include <hip/hip_runtime.h>

 using cudaError_t = hipError_t;
@@ -61,6 +63,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
 #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__)
 #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__)
 #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__)
+#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__)
+#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
 #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__)
 #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__)
 #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
@@ -90,6 +94,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWrite;
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
+#if (CUDART_VERSION >= 11000)
+#include <cuda_bf16.h>
+#endif
+#if (CUDART_VERSION >= 11080)
+#include <cuda_fp8.h>
+#endif

 #endif

pyproject.toml (+1 -1)

@@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"

 [project]
 name = "mscclpp"
-version = "0.4.2"
+version = "0.4.3"

 [tool.scikit-build]
 cmake.minimum-version = "3.25.0"
File renamed without changes.
File renamed without changes.
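
Not part of the commit: after a source install, the package metadata reported by pip should reflect the bump in `pyproject.toml`; a hypothetical sanity check:

```bash
# Hypothetical check after `python3 -m pip install .`
python3 -m pip show mscclpp | grep -i '^version'   # expected: Version: 0.4.3
```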

test/deploy/setup.sh (+2 -2)

@@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
 done

 if [[ "${CUDA_VERSION}" == *"11."* ]]; then
-    pip3 install -r /root/mscclpp/python/requirements_cu11.txt
+    pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
 else
-    pip3 install -r /root/mscclpp/python/requirements_cu12.txt
+    pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
 fi

 cd /root/mscclpp && pip3 install .

test/unit/fifo_tests.cu (+1 -1)

@@ -51,6 +51,7 @@ TEST(FifoTest, Fifo) {
   uint64_t flushCnt = 0;
   mscclpp::Timer timer(3);
   for (uint64_t i = 0; i < ITER; ++i) {
+    trigger = hostFifo.poll();
     while (trigger.fst == 0 || trigger.snd == 0) {
       trigger = hostFifo.poll();

@@ -66,7 +67,6 @@ TEST(FifoTest, Fifo) {
     if ((++flushCnt % hostFifo.size()) == 0) {
       hostFifo.flushTail();
     }
-    trigger.fst = 0;
     spin = 0;
   }
   hostFifo.flushTail(true);
