Commit 85fc08e

add a new MPI program to evaluate MPI_Alltoallw

1 parent 5e95f0e commit 85fc08e

File tree

MPI/alltoallw.c
MPI/sbatch_perlmutter.sh

2 files changed: +361 -0 lines changed

MPI/alltoallw.c

Lines changed: 290 additions & 0 deletions
@@ -0,0 +1,290 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * Copyright (C) 2025, Northwestern University
 * See COPYRIGHT notice in top-level directory.
 *
 * Evaluate performance of all-to-many personalized communication implemented
 * with MPI_Alltoallw() and MPI_Issend()/MPI_Irecv().
 *
 * To compile:
 *   % mpicc -O2 alltoallw.c -o alltoallw
 *
 * Usage:
 *   % ./alltoallw -h
 *   Usage: ./alltoallw [OPTION]
 *      [-h] Print this help message
 *      [-v] Verbose mode (default: no)
 *      [-n num] number of iterations (default: 1)
 *      [-r ratio] every ratio-th process is a receiver (default: 1)
 *      [-l len] receive message size per iteration (default: 8MB)
 *
 * Example run command and output on screen:
 *   % mpiexec -n 2048 ./alltoallw -n 253 -r 32
 *
 *   nprocs = 2048
 *   ntimes = 253
 *   num_recvers = 64
 *   individual message len = 4096 bytes
 *   send/recv buffer gap = 4 bytes
 *   Time for using MPI_Alltoallw = 53.60 sec
 *   Time for using MPI_Issend/Irecv = 2.59 sec
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#include <mpi.h>

static int verbose;

/* number of padding bytes between consecutive message blocks */
#define GAP 4

#define ERR \
    if (err != MPI_SUCCESS) { \
        int errorStringLen; \
        char errorString[MPI_MAX_ERROR_STRING]; \
        MPI_Error_string(err, errorString, &errorStringLen); \
        printf("Error at line %d: %s\n",__LINE__,errorString); \
        goto err_out; \
    }

void run_alltoallw(int   ntimes,
                   int   ratio,
                   int   is_receiver,
                   int   len,
                   char *sendBuf,
                   char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, num_recvers;
    int *sendCounts, *recvCounts, *sendDisps, *recvDisps;
    MPI_Datatype *sendTypes, *recvTypes;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    sendTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs * 2);
    recvTypes = sendTypes + nprocs;
    for (i=0; i<nprocs * 2; i++) sendTypes[i] = MPI_BYTE;

    sendCounts = (int*) calloc(nprocs * 2, sizeof(int));
    recvCounts = sendCounts + nprocs;
    sendDisps = (int*) calloc(nprocs * 2, sizeof(int));
    recvDisps = sendDisps + nprocs;

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    /* Only receivers have non-zero data to receive */
    if (is_receiver) {
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (i != rank) { /* skip receiving from self */
                recvCounts[i] = len;
                recvDisps[i] = (len + GAP) * j;
            }
            j++;
            if (verbose && i != rank)
                printf("%2d recv from %2d of %d\n",rank,i,recvCounts[i]);
        }
    }

    /* All ranks send to each receiver */
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (i % ratio) continue; /* i is not a receiver */
        if (i != rank) { /* skip sending to self */
            sendCounts[i] = len;
            sendDisps[i] = (len + GAP) * j;
        }
        j++;
        if (verbose && i != rank)
            printf("%2d send to %2d of %d\n",rank,i,sendCounts[i]);
    }

    for (i=0; i<ntimes; i++) {
        err = MPI_Alltoallw(sendPtr, sendCounts, sendDisps, sendTypes,
                            recvPtr, recvCounts, recvDisps, recvTypes,
                            MPI_COMM_WORLD); ERR
        sendPtr += num_recvers * (len + GAP);
        recvPtr += nprocs * (len + GAP);
    }

err_out:
    free(sendTypes);
    free(sendCounts);
    free(sendDisps);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Alltoallw = %.2f sec\n", maxt);
}

void run_async_send_recv(int   ntimes,
                         int   ratio,
                         int   is_receiver,
                         int   len,
                         char *sendBuf,
                         char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, nreqs, num_recvers;
    MPI_Request *reqs;
    MPI_Status *st;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    /* allocate MPI_Request and MPI_Status arrays; per iteration a rank posts
     * at most (nprocs - 1) receives plus num_recvers sends */
    reqs = (MPI_Request*) malloc(sizeof(MPI_Request) * (nprocs + num_recvers));
    st = (MPI_Status*) malloc(sizeof(MPI_Status) * (nprocs + num_recvers));

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    for (i=0; i<ntimes; i++) {
        nreqs = 0;

        /* Only receivers post recv requests */
        if (is_receiver) {
            for (j=0; j<nprocs; j++) {
                if (rank != j) { /* skip recv from self */
                    err = MPI_Irecv(recvPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                    &reqs[nreqs++]);
                    ERR
                }
                recvPtr += len + GAP;
            }
        }

        /* all ranks post send requests */
        for (j=0; j<nprocs; j++) {
            if (j % ratio) continue; /* j is not a receiver */
            if (rank != j) { /* skip send to self */
                err = MPI_Issend(sendPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                 &reqs[nreqs++]);
                ERR
            }
            sendPtr += len + GAP;
        }

        err = MPI_Waitall(nreqs, reqs, st); ERR
    }

err_out:
    free(st);
    free(reqs);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Issend/Irecv = %.2f sec\n", maxt);
}

/*----< usage() >------------------------------------------------------------*/
static void usage (char *argv0) {
    char *help = "Usage: %s [OPTION]\n\
       [-h] Print this help message\n\
       [-v] Verbose mode (default: no)\n\
       [-n num] number of iterations (default: 1)\n\
       [-r ratio] every ratio-th process is a receiver (default: 1)\n\
       [-l len] receive message size per iteration (default: 8MB)\n";
    fprintf (stderr, help, argv0);
}

/*----< main() >------------------------------------------------------------*/
int main(int argc, char **argv) {
    extern int optind;
    extern char *optarg;
    char *sendBuf, *recvBuf;
    int i, rank, nprocs;
    int len, block_len, ntimes, ratio, num_recvers, is_receiver;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    verbose = 0;
    block_len = 8 * 1024 * 1024;
    ntimes = 1;
    ratio = 1;

    /* command-line arguments */
    while ((i = getopt(argc, argv, "hvl:n:r:")) != EOF)
        switch (i) {
            case 'v':
                verbose = 1;
                break;
            case 'l':
                block_len = atoi(optarg);
                break;
            case 'n':
                ntimes = atoi(optarg);
                break;
            case 'r':
                ratio = atoi(optarg);
                break;
            case 'h':
            default:
                if (rank == 0) usage(argv[0]);
                goto err_out;
        }

    /* set the number of receivers */
    if (ratio <= 0 || ratio > nprocs) ratio = 1;
    num_recvers = nprocs / ratio;

    /* set whether this rank has non-zero data to receive */
    is_receiver = (rank % ratio == 0) ? 1 : 0;

    /* per message size */
    len = block_len / nprocs;

    if (verbose && rank == 0)
        printf("nprocs=%d ntimes=%d block_len=%d num_recvers=%d len=%d\n",
               nprocs, ntimes, block_len, num_recvers, len);

    if (verbose && is_receiver)
        printf("rank %2d is_receiver\n", rank);

    if (verbose) fflush(stdout);

    if (rank == 0) {
        printf("nprocs = %d\n", nprocs);
        printf("ntimes = %d\n", ntimes);
        printf("num_recvers = %d\n", num_recvers);
        printf("individual message len = %d bytes\n", len);
        printf("send/recv buffer gap = %d bytes\n", GAP);
    }

    /* allocate send and receive buffers */
    sendBuf = (char*) calloc(num_recvers * (len + GAP) * ntimes, 1);
    recvBuf = (char*) calloc(nprocs * (len + GAP) * ntimes, 1);

    run_alltoallw(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    run_async_send_recv(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    free(recvBuf);
    free(sendBuf);

err_out:
    MPI_Finalize();
    return 0;
}
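Since every entry of sendTypes[] and recvTypes[] is MPI_BYTE, the same all-to-many exchange could also be expressed with MPI_Alltoallv, whose displacements are counted in units of the datatype rather than in bytes; with a one-byte type the two coincide, so the count and displacement arrays built in run_alltoallw() carry over unchanged. A minimal sketch of the drop-in replacement for the MPI_Alltoallw() call above (not part of this commit):

    /* equivalent call under the MPI_BYTE assumption: displacements are in
     * multiples of MPI_BYTE, i.e. the same byte offsets as above */
    err = MPI_Alltoallv(sendPtr, sendCounts, sendDisps, MPI_BYTE,
                        recvPtr, recvCounts, recvDisps, MPI_BYTE,
                        MPI_COMM_WORLD); ERR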

MPI/sbatch_perlmutter.sh

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
#!/bin/bash -l
#SBATCH --constraint=cpu
#SBATCH --qos=regular
#SBATCH -t 00:10:00

#SBATCH --nodes=16
#SBATCH --job-name=alltoallw
#SBATCH -o qout.%x.%j
#SBATCH -e qout.%x.%j
#------------------------------------------------------------------------#
cd $PWD

if test "x$SLURM_NTASKS_PER_NODE" = x ; then
   SLURM_NTASKS_PER_NODE=128
fi
NP=$(($SLURM_JOB_NUM_NODES * $SLURM_NTASKS_PER_NODE))

export FI_MR_CACHE_MONITOR=kdreg2
export FI_CXI_RX_MATCH_MODE=software
export MPICH_OFI_NIC_POLICY=NUMA

echo "------------------------------------------------------"
echo "---- Running on Perlmutter CPU nodes ----"
echo "---- SLURM_CLUSTER_NAME = $SLURM_CLUSTER_NAME"
echo "---- SLURM_JOB_QOS = $SLURM_JOB_QOS"
echo "---- SLURM_JOB_PARTITION = $SLURM_JOB_PARTITION"
echo "---- SLURM_JOB_NAME = $SLURM_JOB_NAME"
echo "---- SBATCH_CONSTRAINT = $SBATCH_CONSTRAINT"
echo "---- SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST"
echo "---- SLURM_JOB_NUM_NODES = $SLURM_JOB_NUM_NODES"
echo "---- SLURM_NTASKS_PER_NODE = $SLURM_NTASKS_PER_NODE"
echo "---- SLURM_JOB_ID = $SLURM_JOB_ID"
echo "---- SLURM out/err file = qout.$SLURM_JOB_NAME.$SLURM_JOB_ID"
echo ""
echo "ENV explicitly set:"
echo "---- FI_MR_CACHE_MONITOR = $FI_MR_CACHE_MONITOR"
echo "---- FI_UNIVERSE_SIZE = $FI_UNIVERSE_SIZE"
echo "---- FI_CXI_DEFAULT_CQ_SIZE = $FI_CXI_DEFAULT_CQ_SIZE"
echo "---- FI_CXI_RX_MATCH_MODE = $FI_CXI_RX_MATCH_MODE"
echo "---- MPICH_COLL_SYNC = $MPICH_COLL_SYNC"
echo "---- MPICH_OFI_NIC_POLICY = $MPICH_OFI_NIC_POLICY"
echo "------------------------------------------------------"
echo ""

# For fast executable loading on Cori and Perlmutter
EXE_FILE=alltoallw
EXE=/tmp/${USER}_${EXE_FILE}
sbcast ${EXE_FILE} ${EXE}

echo ""
echo "========================================================================"
echo ""

NTIMES=3
for ntime in $(seq 1 ${NTIMES}) ; do
    date
    echo "---- iteration $ntime -----------------------------------------------"
    echo ""

    CMD_OPTS="-n 253 -r 32"

    CMD="srun -n $NP ${EXE} $CMD_OPTS"
    echo "CMD=$CMD"
    $CMD

    echo ""
    echo "====================================================================="
done # loop ntimes

date
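Under the defaults above (16 nodes, 128 tasks per node), the script launches 2048 MPI processes with the same -n 253 -r 32 options as the example run in alltoallw.c. Assuming the alltoallw executable sits in the submission directory (sbcast copies it from a relative path), it would be submitted as:

    % sbatch sbatch_perlmutter.sh

Because the script falls back to 128 tasks per node only when SLURM_NTASKS_PER_NODE is unset, a different per-node process count can be requested at submission time, e.g. % sbatch --ntasks-per-node=64 sbatch_perlmutter.sh.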
