
Commit b28be9f

add a new MPI program to evaluate MPI_Alltoallw

1 parent 5e95f0e commit b28be9f

2 files changed: +459 −0 lines changed

MPI/alltoallw.c

Lines changed: 388 additions & 0 deletions
@@ -0,0 +1,388 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * Copyright (C) 2025, Northwestern University
 * See COPYRIGHT notice in top-level directory.
 *
 * Evaluate performance of all-to-many personalized communication implemented
 * with MPI_Alltoallw() and MPI_Issend()/MPI_Irecv().
 *
 * To compile:
 *   % mpicc -O2 alltoallw.c -o alltoallw
 *
 * Usage:
 *   % ./alltoallw -h
 *   Usage: alltoallw [OPTION]
 *      [-h]     Print this help message
 *      [-v]     Verbose mode (default: no)
 *      [-d]     Debug mode to check receive buffer contents (default: no)
 *      [-n num] number of iterations (default: 1)
 *      [-r num] one receiver for every num processes (default: 1)
 *      [-l num] receive amount per iteration (default: 8 MB)
 *      [-g num] gap between 2 consecutive send/recv buffers (default: 4 ints)
 *
 * Example run command and output on screen:
 *   % mpiexec -n 2048 ./alltoallw -n 253 -r 32
 *   number of MPI processes   = 2048
 *   number of iterations      = 253
 *   number of receivers       = 64
 *   individual message length = 4096 bytes
 *   send/recv buffer gap      = 4 int(s)
 *   Recv amount per iteration = 8388608 bytes
 *   Time for using MPI_alltoallw    = 53.60 sec
 *   Time for using MPI_Issend/Irecv = 2.59 sec
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#include <mpi.h>

static int verbose;
static int debug;

#define ERR \
    if (err != MPI_SUCCESS) { \
        int errorStringLen; \
        char errorString[MPI_MAX_ERROR_STRING]; \
        MPI_Error_string(err, errorString, &errorStringLen); \
        printf("Error at line %d: %s\n", __LINE__, errorString); \
        goto err_out; \
    }

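/* Send buffer layout: ntimes * num_recvers blocks, each holding len data
 * ints followed by gap pad ints. Pad cells keep their fill value (-2) and
 * are never transmitted. */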
/* initialize the contents of the send buffer */
void initialize_send_buf(int ntimes,
                         int num_recvers,
                         int len,
                         int gap,
                         int *sendBuf)
{
    int i, j, k, m, nprocs, rank;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    for (i=0; i<(len + gap)*ntimes*num_recvers; i++)
        sendBuf[i] = -2;

    m = 0;
    for (i=0; i<ntimes; i++) {
        for (j=0; j<num_recvers; j++) {
            for (k=0; k<len; k++) {
                sendBuf[m++] = rank;
            }
            m += gap;
        }
    }
}

/* initialize the contents of the receive buffer */
void initialize_recv_buf(int len,
                         int gap,
                         int *recvBuf)
{
    int i, nprocs;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    for (i=0; i<(len + gap)*nprocs; i++)
        recvBuf[i] = -3;
}

/* check whether the contents of the receive buffer are correct */
int check_recv_buf(char *comm_op,
                   int len,
                   int gap,
                   int *recvBuf)
{
    int i, j, k, expect, err=0, nprocs, rank;

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    k = 0;
    for (i=0; i<nprocs; i++) {
        for (j=0; j<len+gap; j++) {
            /* the self block is never received and gap cells are never
             * written, so both must keep the fill value -3 */
            expect = (i == rank) ? -3 : ((j < len) ? i : -3);
            if (recvBuf[k] != expect) {
                printf("Error(%s): rank %d i=%d j=%d expect %d but got %d\n",
                       comm_op, rank, i, j, expect, recvBuf[k]);
                err = 1; /* report mismatch with a non-zero error code */
                goto err_out;
            }
            k++;
        }
    }
err_out:
    return err;
}

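/* Communication pattern: only ranks whose rank ID is divisible by ratio are
 * receivers. Every rank sends len ints to every receiver except itself, so
 * each receiver collects messages from (nprocs - 1) senders. */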
/* all-to-many personalized communication by calling MPI_Alltoallw() */
void run_alltoallw(int ntimes,
                   int ratio,
                   int is_receiver,
                   int len,
                   int gap,
                   int *sendBuf,
                   int *recvBuf)
{
    int *sendPtr;
    int i, j, err, nprocs, rank, num_recvers;
    int *sendCounts, *recvCounts, *sendDisps, *recvDisps;
    MPI_Datatype *sendTypes, *recvTypes;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    sendTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs * 2);
    recvTypes = sendTypes + nprocs;
    for (i=0; i<nprocs * 2; i++) sendTypes[i] = MPI_INT;

    /* calloc() zeros the counts: a rank pair with count 0 exchanges no data */
    sendCounts = (int*) calloc(nprocs * 2, sizeof(int));
    recvCounts = sendCounts + nprocs;
    sendDisps = (int*) calloc(nprocs * 2, sizeof(int));
    recvDisps = sendDisps + nprocs;

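    /* Note: unlike MPI_Alltoallv(), whose displacements are counted in units
     * of the datatype extent, MPI_Alltoallw() takes displacements in bytes,
     * hence the multiplication by sizeof(int) below. */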
    /* Only receivers have non-zero amounts of data to receive */
    if (is_receiver) {
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (i != rank) { /* skip receiving from self */
                recvCounts[i] = len;
                recvDisps[i] = (len + gap) * j * sizeof(int);
            }
            j++;
            if (verbose && i != rank)
                printf("%2d recv from %2d of %d\n",rank,i,recvCounts[i]);
        }
    }

    /* All ranks send to each receiver */
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (i % ratio) continue; /* i is not a receiver */
        if (i != rank) { /* skip sending to self */
            sendCounts[i] = len;
            sendDisps[i] = (len + gap) * j * sizeof(int);
        }
        j++;
        if (verbose && i != rank)
            printf("%2d send to %2d of %d\n",rank,i,sendCounts[i]);
    }

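    /* Each iteration consumes the next slice of sendBuf, while receivers
     * reuse the same recvBuf. */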
    sendPtr = sendBuf;
    for (i=0; i<ntimes; i++) {
        if (debug && is_receiver)
            initialize_recv_buf(len, gap, recvBuf);

        err = MPI_Alltoallw(sendPtr, sendCounts, sendDisps, sendTypes,
                            recvBuf, recvCounts, recvDisps, recvTypes,
                            MPI_COMM_WORLD); ERR
        sendPtr += num_recvers * (len + gap);

        if (debug && is_receiver)
            check_recv_buf("alltoallw", len, gap, recvBuf);
    }

err_out:
    free(sendTypes);
    free(sendCounts);
    free(sendDisps);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_alltoallw    = %.2f sec\n", maxt);
}

/* all-to-many personalized communication by calling MPI_Issend/Irecv() */
void run_async_send_recv(int ntimes,
                         int ratio,
                         int is_receiver,
                         int len,
                         int gap,
                         int *sendBuf,
                         int *recvBuf)
{
    int *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, nreqs, num_recvers;
    MPI_Request *reqs;
    MPI_Status *st;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

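    /* A receiver posts up to (nprocs - 1) receives and every rank posts up
     * to num_recvers sends per iteration, so (nprocs + num_recvers) bounds
     * the number of pending requests. */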
    /* allocate MPI_Request and MPI_Status arrays */
    reqs = (MPI_Request*) malloc(sizeof(MPI_Request) * (nprocs + num_recvers));
    st = (MPI_Status*) malloc(sizeof(MPI_Status) * (nprocs + num_recvers));

    sendPtr = sendBuf;
    for (i=0; i<ntimes; i++) {
        if (debug && is_receiver)
            initialize_recv_buf(len, gap, recvBuf);

        nreqs = 0;
        recvPtr = recvBuf;

        /* Only receivers post recv requests */
        if (is_receiver) {
            for (j=0; j<nprocs; j++) {
                if (rank != j) { /* skip recv from self */
                    err = MPI_Irecv(recvPtr, len, MPI_INT, j, 0, MPI_COMM_WORLD,
                                    &reqs[nreqs++]);
                    ERR
                }
                recvPtr += len + gap;
            }
        }

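        /* MPI_Issend() starts a synchronous-mode send: it cannot complete
         * until the matching receive has begun, so the MPI_Waitall() below
         * reflects actual delivery rather than local buffering. */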
        /* all ranks post send requests */
        for (j=0; j<nprocs; j++) {
            if (j % ratio) continue; /* j is not a receiver */
            if (rank != j) { /* skip send to self */
                err = MPI_Issend(sendPtr, len, MPI_INT, j, 0, MPI_COMM_WORLD,
                                 &reqs[nreqs++]);
                ERR
            }
            sendPtr += len + gap;
        }

        err = MPI_Waitall(nreqs, reqs, st); ERR

        if (debug && is_receiver)
            check_recv_buf("issend/irecv", len, gap, recvBuf);
    }

err_out:
    free(st);
    free(reqs);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Issend/Irecv = %.2f sec\n", maxt);
}

/*----< usage() >------------------------------------------------------------*/
static void usage (char *argv0) {
    char *help = "Usage: %s [OPTION]\n\
       [-h]     Print this help message\n\
       [-v]     Verbose mode (default: no)\n\
       [-d]     Debug mode to check receive buffer contents (default: no)\n\
       [-n num] number of iterations (default: 1)\n\
       [-r num] one receiver for every num processes (default: 1)\n\
       [-l num] receive amount per iteration (default: 8 MB)\n\
       [-g num] gap between 2 consecutive send/recv buffers (default: 4 ints)\n";
    fprintf (stderr, help, argv0);
}

/*----< main() >------------------------------------------------------------*/
int main(int argc, char **argv) {
    extern int optind;
    extern char *optarg;
    int i, rank, nprocs;
    int len, gap, block_len, ntimes, ratio, num_recvers, is_receiver;
    int *sendBuf, *recvBuf=NULL; /* recvBuf stays NULL on non-receivers */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    verbose = 0;
    debug = 0;
    ntimes = 1;
    ratio = 1;
    block_len = 8 * 1024 * 1024;
    gap = 4;

    /* command-line arguments */
    while ((i = getopt (argc, argv, "hvdn:r:l:g:")) != EOF)
        switch (i) {
            case 'v':
                verbose = 1;
                break;
            case 'd':
                debug = 1;
                break;
            case 'n':
                ntimes = atoi(optarg);
                break;
            case 'r':
                ratio = atoi(optarg);
                break;
            case 'l':
                block_len = atoi(optarg);
                break;
            case 'g':
                gap = atoi(optarg);
                break;
            case 'h':
            default:
                if (rank == 0) usage(argv[0]);
                goto err_out;
        }

    /* set the number of receivers */
    if (ratio <= 0 || ratio > nprocs) ratio = 1;
    num_recvers = nprocs / ratio;

    /* set whether this rank has non-zero data to receive */
    is_receiver = (rank % ratio == 0) ? 1 : 0;

    /* per-message size: block_len bytes split evenly among nprocs senders */
    len = block_len / sizeof(int) / nprocs;

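    /* Integer division rounds len down; with a very large nprocs or a small
     * block_len, len can become 0, in which case no data is exchanged. */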
    if (verbose && rank == 0)
        printf("nprocs=%d ntimes=%d block_len=%d num_recvers=%d len=%d gap=%d\n",
               nprocs, ntimes, block_len, num_recvers, len, gap);

    if (verbose && is_receiver)
        printf("rank %2d is_receiver\n", rank);

    if (verbose) fflush(stdout);

    if (rank == 0) {
        printf("number of MPI processes   = %d\n", nprocs);
        printf("number of iterations      = %d\n", ntimes);
        printf("number of receivers       = %d\n", num_recvers);
        printf("individual message length = %zu bytes\n", len*sizeof(int));
        printf("send/recv buffer gap      = %d int(s)\n", gap);
        printf("Recv amount per iteration = %d bytes\n", block_len);
    }

    /* allocate and initialize send buffer */
    sendBuf = (int*) malloc(sizeof(int) * (len + gap) * ntimes * num_recvers);
    initialize_send_buf(ntimes, num_recvers, len, gap, sendBuf);

    if (is_receiver)
        /* receive buffer is reused every iteration */
        recvBuf = (int*) malloc(sizeof(int) * (len + gap) * nprocs);

    /* perform all-to-many communication using MPI_Alltoallw() */
    MPI_Barrier(MPI_COMM_WORLD);
    run_alltoallw(ntimes, ratio, is_receiver, len, gap, sendBuf, recvBuf);

    /* perform all-to-many communication using MPI_Issend()/MPI_Irecv() */
    MPI_Barrier(MPI_COMM_WORLD);
    run_async_send_recv(ntimes, ratio, is_receiver, len, gap, sendBuf, recvBuf);

    if (is_receiver)
        free(recvBuf);
    free(sendBuf);

err_out:
    MPI_Finalize();
    return 0;
}