-
Notifications
You must be signed in to change notification settings - Fork 11
Expand file tree
/
Copy pathnccl_demo.cu
More file actions
109 lines (91 loc) · 3.68 KB
/
nccl_demo.cu
File metadata and controls
109 lines (91 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// nccl_demo.cu — Minimal multi-GPU NCCL demo for NSys profiling.
//
// Runs AllReduce, AllGather, and ReduceScatter on 2+ GPUs so that
// NSys captures each collective type. No complex compute — the
// goal is to show NCCL operations in a merged IKP + NSys trace.
//
// Build:
// nvcc -O3 -std=c++17 -arch=sm_90a \
// -I $NCCL_HOME/include -L $NCCL_HOME/lib -lnccl \
// nccl_demo.cu -o nccl_demo
//
// Run:
// CUDA_VISIBLE_DEVICES=0,1 ./nccl_demo
#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <thread>
#define CK_CUDA(e) do { if ((e) != cudaSuccess) { \
fprintf(stderr, "CUDA %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(e)); exit(1); } } while(0)
#define CK_NCCL(e) do { if ((e) != ncclSuccess) { \
fprintf(stderr, "NCCL %s:%d: %s\n", __FILE__, __LINE__, ncclGetErrorString(e)); exit(1); } } while(0)
int main(int argc, char** argv) {
int ngpus = 2;
if (argc > 1) ngpus = atoi(argv[1]);
int dev_count = 0;
CK_CUDA(cudaGetDeviceCount(&dev_count));
if (ngpus > dev_count) { fprintf(stderr, "Only %d GPUs available\n", dev_count); return 1; }
const size_t N = 1 << 20; // 1M floats = 4 MB per GPU
printf("NCCL demo: %d GPUs, %zu floats (%.1f MB) per GPU\n", ngpus, N, N * 4.0 / 1e6);
// Init NCCL
std::vector<int> devs(ngpus);
for (int i = 0; i < ngpus; i++) devs[i] = i;
std::vector<ncclComm_t> comms(ngpus);
CK_NCCL(ncclCommInitAll(comms.data(), ngpus, devs.data()));
// Allocate per-GPU buffers
struct Gpu { float *send, *recv; cudaStream_t stream; };
std::vector<Gpu> gpus(ngpus);
for (int g = 0; g < ngpus; g++) {
CK_CUDA(cudaSetDevice(g));
CK_CUDA(cudaMalloc(&gpus[g].send, N * sizeof(float)));
CK_CUDA(cudaMalloc(&gpus[g].recv, N * ngpus * sizeof(float)));
CK_CUDA(cudaMemset(gpus[g].send, 1, N * sizeof(float)));
CK_CUDA(cudaStreamCreate(&gpus[g].stream));
}
// Run each collective type (all GPUs must participate together)
auto run_all = [&](const char* name, auto fn) {
printf(" %-20s", name);
std::vector<std::thread> threads;
for (int g = 0; g < ngpus; g++) {
threads.emplace_back([&, g]() {
CK_CUDA(cudaSetDevice(g));
fn(g);
CK_CUDA(cudaStreamSynchronize(gpus[g].stream));
});
}
for (auto& t : threads) t.join();
printf("done\n");
};
// Warmup
run_all("warmup", [&](int g) {
CK_NCCL(ncclAllReduce(gpus[g].send, gpus[g].send, N, ncclFloat, ncclSum, comms[g], gpus[g].stream));
});
// --- The collectives we want NSys to capture ---
run_all("AllReduce", [&](int g) {
CK_NCCL(ncclAllReduce(gpus[g].send, gpus[g].send, N, ncclFloat, ncclSum, comms[g], gpus[g].stream));
});
run_all("AllGather", [&](int g) {
CK_NCCL(ncclAllGather(gpus[g].send, gpus[g].recv, N, ncclFloat, comms[g], gpus[g].stream));
});
run_all("ReduceScatter", [&](int g) {
CK_NCCL(ncclReduceScatter(gpus[g].send, gpus[g].send, N / ngpus, ncclFloat, ncclSum, comms[g], gpus[g].stream));
});
run_all("Broadcast", [&](int g) {
CK_NCCL(ncclBroadcast(gpus[g].send, gpus[g].send, N, ncclFloat, 0, comms[g], gpus[g].stream));
});
run_all("Reduce", [&](int g) {
CK_NCCL(ncclReduce(gpus[g].send, gpus[g].send, N, ncclFloat, ncclSum, 0, comms[g], gpus[g].stream));
});
// Cleanup
for (int g = 0; g < ngpus; g++) {
CK_CUDA(cudaSetDevice(g));
cudaFree(gpus[g].send);
cudaFree(gpus[g].recv);
cudaStreamDestroy(gpus[g].stream);
ncclCommDestroy(comms[g]);
}
printf("Done.\n");
return 0;
}