Commit f375505

add a new MPI program to evaluate MPI_Alltoallw

1 parent 5e95f0e commit f375505

File tree

2 files changed: +437 -0 lines changed

MPI/alltoallw.c

Lines changed: 366 additions & 0 deletions

@@ -0,0 +1,366 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * Copyright (C) 2025, Northwestern University
 * See COPYRIGHT notice in top-level directory.
 *
 * Evaluate performance of all-to-many personalized communication implemented
 * with MPI_Alltoallw() and MPI_Issend()/MPI_Irecv().
 *
 * To compile:
 *    % mpicc -O2 alltoallw.c -o alltoallw
 *
 * Usage:
 *    % ./alltoallw -h
 *    Usage: ./alltoallw [OPTION]
 *       [-h] Print this help message
 *       [-v] Verbose mode (default: no)
 *       [-n num] number of iterations (default: 1)
 *       [-r num] one in every num processes is a receiver (default: 1)
 *       [-l num] receive amount per iteration (default: 8 MB)
 *       [-g num] gap between 2 consecutive send/recv buffers (default: 4 ints)
 *
 * Example run command and output on screen:
 *    % mpiexec -n 2048 ./alltoallw -n 253 -r 32
 *
 *    nprocs                      = 2048
 *    ntimes                      = 253
 *    num_recvers                 = 64
 *    individual message len      = 4096 bytes
 *    send/recv buffer gap        = 4 int(s)
 *    Recv amount per iteration   = 8388608 bytes
 *    Time for using MPI_Alltoallw    = 53.60 sec
 *    Time for using MPI_Issend/Irecv =  2.59 sec
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#include <mpi.h>

static int verbose;

#define ERR \
    if (err != MPI_SUCCESS) { \
        int errorStringLen; \
        char errorString[MPI_MAX_ERROR_STRING]; \
        MPI_Error_string(err, errorString, &errorStringLen); \
        printf("Error at line %d: %s\n",__LINE__,errorString); \
        goto err_out; \
    }

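/* ERR must immediately follow a statement that sets "err"; on failure it
 * prints the MPI error string and jumps to the enclosing function's
 * err_out label. */
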
void run_alltoallw(int ntimes,
                   int ratio,
                   int is_receiver,
                   int len,
                   int gap,
                   int *sendBuf,
                   int *recvBuf)
{
    int *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, num_recvers;
    int *sendCounts, *recvCounts, *sendDisps, *recvDisps;
    MPI_Datatype *sendTypes, *recvTypes;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    sendTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs * 2);
    recvTypes = sendTypes + nprocs;
    for (i=0; i<nprocs * 2; i++) sendTypes[i] = MPI_INT;

    sendCounts = (int*) calloc(nprocs * 2, sizeof(int));
    recvCounts = sendCounts + nprocs;
    sendDisps = (int*) calloc(nprocs * 2, sizeof(int));
    recvDisps = sendDisps + nprocs;

    sendPtr = sendBuf;
    recvPtr = recvBuf;

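    /* Note: MPI_Alltoallw takes its displacements in bytes (hence the
     * multiplication by sizeof(int) below), unlike MPI_Alltoallv, whose
     * displacements are in units of the datatype extent. */
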
    /* Only receivers have non-zero amounts of data to receive */
    if (is_receiver) {
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (i != rank) { /* skip receiving from self */
                recvCounts[i] = len;
                recvDisps[i] = (len + gap) * j * sizeof(int);
            }
            j++;
            if (verbose && i != rank)
                printf("%2d recv from %2d of %d\n",rank,i,recvCounts[i]);
        }
    }

    /* All ranks send to each receiver */
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (i % ratio) continue; /* i is not a receiver */
        if (i != rank) { /* skip sending to self */
            sendCounts[i] = len;
            sendDisps[i] = (len + gap) * j * sizeof(int);
        }
        j++;
        if (verbose && i != rank)
            printf("%2d send to %2d of %d\n",rank,i,sendCounts[i]);
    }

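    /* Every rank must take part in the collective call, even non-receivers:
     * they simply pass all-zero recvCounts. */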
    for (i=0; i<ntimes; i++) {
        err = MPI_Alltoallw(sendPtr, sendCounts, sendDisps, sendTypes,
                            recvPtr, recvCounts, recvDisps, recvTypes,
                            MPI_COMM_WORLD); ERR
        sendPtr += num_recvers * (len + gap);
        recvPtr += nprocs * (len + gap);
    }

err_out:
    free(sendTypes);
    free(sendCounts);
    free(sendDisps);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Alltoallw    = %.2f sec\n", maxt);
}

void run_async_send_recv(int ntimes,
                         int ratio,
                         int is_receiver,
                         int len,
                         int gap,
                         int *sendBuf,
                         int *recvBuf)
{
    int *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, nreqs, num_recvers;
    MPI_Request *reqs;
    MPI_Status *st;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    /* allocate MPI_Request and MPI_Status arrays */
    reqs = (MPI_Request*) malloc(sizeof(MPI_Request) * (nprocs + num_recvers));
    st = (MPI_Status*) malloc(sizeof(MPI_Status) * (nprocs + num_recvers));

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    for (i=0; i<ntimes; i++) {
        nreqs = 0;

        /* Only receivers post recv requests */
        if (is_receiver) {
            for (j=0; j<nprocs; j++) {
                if (rank != j) { /* skip recv from self */
                    err = MPI_Irecv(recvPtr, len, MPI_INT, j, 0, MPI_COMM_WORLD,
                                    &reqs[nreqs++]);
                    ERR
                }
                recvPtr += len + gap;
            }
        }

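        /* MPI_Issend posts a synchronous-mode send, which cannot complete
         * until the matching receive has started; the MPI_Waitall below
         * therefore also confirms that every message has begun arriving. */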
        /* all ranks post send requests */
        for (j=0; j<nprocs; j++) {
            if (j % ratio) continue; /* j is not a receiver */
            if (rank != j) { /* skip send to self */
                err = MPI_Issend(sendPtr, len, MPI_INT, j, 0, MPI_COMM_WORLD,
                                 &reqs[nreqs++]);
                ERR
            }
            sendPtr += len + gap;
        }

        err = MPI_Waitall(nreqs, reqs, st); ERR
    }

err_out:
    free(st);
    free(reqs);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Issend/Irecv = %.2f sec\n", maxt);
}

void initialize_bufs(int ntimes,
                     int num_recvers,
                     int len,
                     int gap,
                     int *sendBuf,
                     int *recvBuf)
{
    int i, j, k, m, nprocs, rank;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

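    /* Fill the buffers with sentinel values: -2 in the send buffer and -3
     * in the receive buffer, then write each rank's own ID into the payload
     * entries it will send. check_recv_buf() relies on the -3 sentinel to
     * verify that gap regions and the self-rank block were left untouched. */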
    for (i=0; i<(len + gap)*ntimes*num_recvers; i++)
        sendBuf[i] = -2;
    for (i=0; i<(len + gap)*ntimes*nprocs; i++)
        recvBuf[i] = -3;
    m = 0;
    for (i=0; i<ntimes; i++) {
        for (j=0; j<num_recvers; j++) {
            for (k=0; k<len; k++) {
                sendBuf[m++] = rank;
            }
            m += gap;
        }
    }
}

int check_recv_buf(char *comm_op,
                   int ntimes,
                   int len,
                   int gap,
                   int *recvBuf)
{
    int i, j, k, m, expect, err=0, nprocs, rank;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

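    /* The block of (len + gap) ints received from rank j must contain j in
     * its first len entries; the gap entries, and the entire block at the
     * self-rank position, must still hold the -3 sentinel. */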
    m = 0;
    for (i=0; i<ntimes; i++) {
        for (j=0; j<nprocs; j++) {
            for (k=0; k<len+gap; k++) {
                expect = (j == rank) ? -3 : ((k < len) ? j : -3);
                if (recvBuf[m] != expect) {
                    printf("Error(%s): rank %d i=%d j=%d k=%d expect %d but got %d\n",
                           comm_op, rank, i, j, k, expect, recvBuf[m]);
                    err = 1; /* return a non-zero error code on mismatch */
                    goto err_out;
                }
                m++;
            }
        }
    }
err_out:
    return err;
}

/*----< usage() >------------------------------------------------------------*/
static void usage (char *argv0) {
    char *help = "Usage: %s [OPTION]\n\
       [-h] Print this help message\n\
       [-v] Verbose mode (default: no)\n\
       [-n num] number of iterations (default: 1)\n\
       [-r num] one in every num processes is a receiver (default: 1)\n\
       [-l num] receive amount per iteration (default: 8 MB)\n\
       [-g num] gap between 2 consecutive send/recv buffers (default: 4 ints)\n";
    fprintf (stderr, help, argv0);
}

/*----< main() >------------------------------------------------------------*/
int main(int argc, char **argv) {
    extern int optind;
    extern char *optarg;
    int i, rank, nprocs;
    int len, gap, block_len, ntimes, ratio, num_recvers, is_receiver;
    int *sendBuf, *recvBuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    verbose   = 0;
    ntimes    = 1;
    ratio     = 1;
    block_len = 8 * 1024 * 1024;
    gap       = 4;

    /* command-line arguments */
    while ((i = getopt (argc, argv, "hvn:r:l:g:")) != EOF)
        switch (i) {
            case 'v':
                verbose = 1;
                break;
            case 'n':
                ntimes = atoi(optarg);
                break;
            case 'r':
                ratio = atoi(optarg);
                break;
            case 'l':
                block_len = atoi(optarg);
                break;
            case 'g':
                gap = atoi(optarg);
                break;
            case 'h':
            default:
                if (rank == 0) usage(argv[0]);
                goto err_out;
        }

    /* set the number of receivers */
    if (ratio <= 0 || ratio > nprocs) ratio = 1;
    num_recvers = nprocs / ratio;

    /* set whether this rank has non-zero data to receive */
    is_receiver = (rank % ratio == 0) ? 1 : 0;

    /* per message size */
    len = block_len / sizeof(int) / nprocs;
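    /* For example, with the default block_len of 8 MiB on 2048 ranks:
     * 8388608 / sizeof(int) / 2048 = 1024 ints, i.e. 4096 bytes per
     * message, matching the sample output in the header comment. */
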
    if (verbose && rank == 0)
        printf("nprocs=%d ntimes=%d block_len=%d num_recvers=%d len=%d gap=%d\n",
               nprocs, ntimes, block_len, num_recvers, len, gap);

    if (verbose && is_receiver)
        printf("rank %2d is_receiver\n", rank);

    if (verbose) fflush(stdout);

    if (rank == 0) {
        printf("nprocs                    = %d\n", nprocs);
        printf("number of iterations      = %d\n", ntimes);
        printf("number of receivers       = %d\n", num_recvers);
        printf("individual message length = %zu bytes\n",len*sizeof(int));
        printf("send/recv buffer gap      = %d int(s)\n",gap);
        printf("Recv amount per iteration = %d bytes\n",block_len);
    }

    /* allocate and initialize send and receive buffers */
    sendBuf = (int*) malloc(sizeof(int) * (len + gap) * ntimes * num_recvers);
    recvBuf = (int*) malloc(sizeof(int) * (len + gap) * ntimes * nprocs);

    initialize_bufs(ntimes, num_recvers, len, gap, sendBuf, recvBuf);

    MPI_Barrier(MPI_COMM_WORLD);
    run_alltoallw(ntimes, ratio, is_receiver, len, gap, sendBuf, recvBuf);

    if (is_receiver)
        check_recv_buf("alltoallw", ntimes, len, gap, recvBuf);

    initialize_bufs(ntimes, num_recvers, len, gap, sendBuf, recvBuf);

    MPI_Barrier(MPI_COMM_WORLD);
    run_async_send_recv(ntimes, ratio, is_receiver, len, gap, sendBuf, recvBuf);

    if (is_receiver)
        check_recv_buf("isend/irecv", ntimes, len, gap, recvBuf);

    free(recvBuf);
    free(sendBuf);

err_out:
    MPI_Finalize();
    return 0;
}