Commit 85fc08e

add a new MPI program to evaluate MPI_Alltoallw

1 parent 5e95f0e commit 85fc08e

File tree

MPI/alltoallw.c
MPI/sbatch_perlmutter.sh

2 files changed: +361 -0 lines changed

MPI/alltoallw.c

Lines changed: 290 additions & 0 deletions
@@ -0,0 +1,290 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
 *
 * Copyright (C) 2025, Northwestern University
 * See COPYRIGHT notice in top-level directory.
 *
 * Evaluate performance of all-to-many personalized communication implemented
 * with MPI_Alltoallw() and MPI_Issend()/MPI_Irecv().
 *
 * To compile:
 *   % mpicc -O2 alltoallw.c -o alltoallw
 *
 * Usage:
 *   % ./alltoallw -h
 *   Usage: ./alltoallw [OPTION]
 *      [-h] Print this help message
 *      [-v] Verbose mode (default: no)
 *      [-n num] number of iterations (default: 1)
 *      [-r ratio] every ratio-th process is a receiver (default: 1)
 *      [-l len] receive message size per iteration (default: 8MB)
 *
 * Example run command and output on screen:
 *   % mpiexec -n 2048 ./alltoallw -n 253 -r 32
 *
 *   nprocs = 2048
 *   ntimes = 253
 *   num_recvers = 64
 *   individual message len = 4096 bytes
 *   send/recv buffer gap = 4 bytes
 *   Time for using MPI_Alltoallw = 53.60 sec
 *   Time for using MPI_Issend/Irecv = 2.59 sec
 *
 * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <assert.h>

#include <mpi.h>

static int verbose;

/* number of padding bytes between consecutive message blocks */
#define GAP 4

#define ERR \
    if (err != MPI_SUCCESS) { \
        int errorStringLen; \
        char errorString[MPI_MAX_ERROR_STRING]; \
        MPI_Error_string(err, errorString, &errorStringLen); \
        printf("Error at line %d: %s\n",__LINE__,errorString); \
        goto err_out; \
    }

void run_alltoallw(int   ntimes,
                   int   ratio,
                   int   is_receiver,
                   int   len,
                   char *sendBuf,
                   char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, num_recvers;
    int *sendCounts, *recvCounts, *sendDisps, *recvDisps;
    MPI_Datatype *sendTypes, *recvTypes;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    sendTypes = (MPI_Datatype*) malloc(sizeof(MPI_Datatype) * nprocs * 2);
    recvTypes = sendTypes + nprocs;
    for (i=0; i<nprocs * 2; i++) sendTypes[i] = MPI_BYTE;

    sendCounts = (int*) calloc(nprocs * 2, sizeof(int));
    recvCounts = sendCounts + nprocs;
    sendDisps = (int*) calloc(nprocs * 2, sizeof(int));
    recvDisps = sendDisps + nprocs;

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    /* Only receivers have non-zero data to receive */
    if (is_receiver) {
        j = 0;
        for (i=0; i<nprocs; i++) {
            if (i != rank) { /* skip receiving from self */
                recvCounts[i] = len;
                recvDisps[i] = (len + GAP) * j;
            }
            j++;
            if (verbose && i != rank)
                printf("%2d recv from %2d of %d\n",rank,i,recvCounts[i]);
        }
    }

    /* All ranks send to each receiver */
    j = 0;
    for (i=0; i<nprocs; i++) {
        if (i % ratio) continue; /* i is not a receiver */
        if (i != rank) { /* skip sending to self */
            sendCounts[i] = len;
            sendDisps[i] = (len + GAP) * j;
        }
        j++;
        if (verbose && i != rank)
            printf("%2d send to %2d of %d\n",rank,i,sendCounts[i]);
    }

    for (i=0; i<ntimes; i++) {
        err = MPI_Alltoallw(sendPtr, sendCounts, sendDisps, sendTypes,
                            recvPtr, recvCounts, recvDisps, recvTypes,
                            MPI_COMM_WORLD); ERR
        sendPtr += num_recvers * (len + GAP);
        recvPtr += nprocs * (len + GAP);
    }

err_out:
    free(sendTypes);
    free(sendCounts);
    free(sendDisps);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Alltoallw = %.2f sec\n", maxt);
}

void run_async_send_recv(int   ntimes,
                         int   ratio,
                         int   is_receiver,
                         int   len,
                         char *sendBuf,
                         char *recvBuf)
{
    char *sendPtr, *recvPtr;
    int i, j, err, nprocs, rank, nreqs, num_recvers;
    MPI_Request *reqs;
    MPI_Status *st;
    double timing, maxt;

    MPI_Barrier(MPI_COMM_WORLD);
    timing = MPI_Wtime();

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    num_recvers = nprocs / ratio;

    /* allocate MPI_Request and MPI_Status arrays; per iteration a rank posts
     * at most (nprocs - 1) receives plus num_recvers sends */
    reqs = (MPI_Request*) malloc(sizeof(MPI_Request) * (nprocs + num_recvers));
    st = (MPI_Status*) malloc(sizeof(MPI_Status) * (nprocs + num_recvers));

    sendPtr = sendBuf;
    recvPtr = recvBuf;

    for (i=0; i<ntimes; i++) {
        nreqs = 0;

        /* Only receivers post recv requests */
        if (is_receiver) {
            for (j=0; j<nprocs; j++) {
                if (rank != j) { /* skip recv from self */
                    err = MPI_Irecv(recvPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                    &reqs[nreqs++]);
                    ERR
                }
                recvPtr += len + GAP;
            }
        }

        /* all ranks post send requests */
        for (j=0; j<nprocs; j++) {
            if (j % ratio) continue; /* j is not a receiver */
            if (rank != j) { /* skip send to self */
                err = MPI_Issend(sendPtr, len, MPI_BYTE, j, 0, MPI_COMM_WORLD,
                                 &reqs[nreqs++]);
                ERR
            }
            sendPtr += len + GAP;
        }

        err = MPI_Waitall(nreqs, reqs, st); ERR
    }

err_out:
    free(st);
    free(reqs);

    timing = MPI_Wtime() - timing;
    MPI_Reduce(&timing, &maxt, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    if (rank == 0)
        printf("Time for using MPI_Issend/Irecv = %.2f sec\n", maxt);
}

/*----< usage() >------------------------------------------------------------*/
static void usage (char *argv0) {
    char *help = "Usage: %s [OPTION]\n\
       [-h] Print this help message\n\
       [-v] Verbose mode (default: no)\n\
       [-n num] number of iterations (default: 1)\n\
       [-r ratio] every ratio-th process is a receiver (default: 1)\n\
       [-l len] receive message size per iteration (default: 8MB)\n";
    fprintf (stderr, help, argv0);
}

/*----< main() >------------------------------------------------------------*/
int main(int argc, char **argv) {
    extern int optind;
    extern char *optarg;
    char *sendBuf, *recvBuf;
    int i, rank, nprocs;
    int len, block_len, ntimes, ratio, num_recvers, is_receiver;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    verbose = 0;
    block_len = 8 * 1024 * 1024;
    ntimes = 1;
    ratio = 1;

    /* command-line arguments */
    while ((i = getopt(argc, argv, "hvl:n:r:")) != EOF)
        switch (i) {
            case 'v':
                verbose = 1;
                break;
            case 'l':
                block_len = atoi(optarg);
                break;
            case 'n':
                ntimes = atoi(optarg);
                break;
            case 'r':
                ratio = atoi(optarg);
                break;
            case 'h':
            default:
                if (rank == 0) usage(argv[0]);
                goto err_out;
        }

    /* set the number of receivers */
    if (ratio <= 0 || ratio > nprocs) ratio = 1;
    num_recvers = nprocs / ratio;

    /* set whether this rank has non-zero data to receive */
    is_receiver = (rank % ratio == 0) ? 1 : 0;

    /* per message size */
    len = block_len / nprocs;

    if (verbose && rank == 0)
        printf("nprocs=%d ntimes=%d block_len=%d num_recvers=%d len=%d\n",
               nprocs, ntimes, block_len, num_recvers, len);

    if (verbose && is_receiver)
        printf("rank %2d is_receiver\n", rank);

    if (verbose) fflush(stdout);

    if (rank == 0) {
        printf("nprocs = %d\n", nprocs);
        printf("ntimes = %d\n", ntimes);
        printf("num_recvers = %d\n", num_recvers);
        printf("individual message len = %d bytes\n", len);
        printf("send/recv buffer gap = %d bytes\n", GAP);
    }

    /* allocate send and receive buffers */
    sendBuf = (char*) calloc(num_recvers * (len + GAP) * ntimes, 1);
    recvBuf = (char*) calloc(nprocs * (len + GAP) * ntimes, 1);

    run_alltoallw(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    run_async_send_recv(ntimes, ratio, is_receiver, len, sendBuf, recvBuf);

    free(recvBuf);
    free(sendBuf);

err_out:
    MPI_Finalize();
    return 0;
}
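Since every entry of sendTypes[] and recvTypes[] is MPI_BYTE, the same all-to-many exchange could also be expressed with MPI_Alltoallv, whose displacements are counted in units of the datatype rather than in bytes; with a one-byte type the two coincide, so the count and displacement arrays built in run_alltoallw() carry over unchanged. A minimal sketch of the drop-in replacement for the MPI_Alltoallw() call above (not part of this commit):

    /* equivalent call under the MPI_BYTE assumption: displacements are in
     * multiples of MPI_BYTE, i.e. the same byte offsets as above */
    err = MPI_Alltoallv(sendPtr, sendCounts, sendDisps, MPI_BYTE,
                        recvPtr, recvCounts, recvDisps, MPI_BYTE,
                        MPI_COMM_WORLD); ERR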

MPI/sbatch_perlmutter.sh

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
#!/bin/bash -l
#SBATCH --constraint=cpu
#SBATCH --qos=regular
#SBATCH -t 00:10:00

#SBATCH --nodes=16
#SBATCH --job-name=alltoallw
#SBATCH -o qout.%x.%j
#SBATCH -e qout.%x.%j
#------------------------------------------------------------------------#
cd $PWD

if test "x$SLURM_NTASKS_PER_NODE" = x ; then
   SLURM_NTASKS_PER_NODE=128
fi
NP=$(($SLURM_JOB_NUM_NODES * $SLURM_NTASKS_PER_NODE))

export FI_MR_CACHE_MONITOR=kdreg2
export FI_CXI_RX_MATCH_MODE=software
export MPICH_OFI_NIC_POLICY=NUMA

echo "------------------------------------------------------"
echo "---- Running on Perlmutter CPU nodes ----"
echo "---- SLURM_CLUSTER_NAME = $SLURM_CLUSTER_NAME"
echo "---- SLURM_JOB_QOS = $SLURM_JOB_QOS"
echo "---- SLURM_JOB_PARTITION = $SLURM_JOB_PARTITION"
echo "---- SLURM_JOB_NAME = $SLURM_JOB_NAME"
echo "---- SBATCH_CONSTRAINT = $SBATCH_CONSTRAINT"
echo "---- SLURM_JOB_NODELIST = $SLURM_JOB_NODELIST"
echo "---- SLURM_JOB_NUM_NODES = $SLURM_JOB_NUM_NODES"
echo "---- SLURM_NTASKS_PER_NODE = $SLURM_NTASKS_PER_NODE"
echo "---- SLURM_JOB_ID = $SLURM_JOB_ID"
echo "---- SLURM out/err file = qout.$SLURM_JOB_NAME.$SLURM_JOB_ID"
echo ""
echo "ENV explicitly set:"
echo "---- FI_MR_CACHE_MONITOR = $FI_MR_CACHE_MONITOR"
echo "---- FI_UNIVERSE_SIZE = $FI_UNIVERSE_SIZE"
echo "---- FI_CXI_DEFAULT_CQ_SIZE = $FI_CXI_DEFAULT_CQ_SIZE"
echo "---- FI_CXI_RX_MATCH_MODE = $FI_CXI_RX_MATCH_MODE"
echo "---- MPICH_COLL_SYNC = $MPICH_COLL_SYNC"
echo "---- MPICH_OFI_NIC_POLICY = $MPICH_OFI_NIC_POLICY"
echo "------------------------------------------------------"
echo ""

# For fast executable loading on Cori and Perlmutter
EXE_FILE=alltoallw
EXE=/tmp/${USER}_${EXE_FILE}
sbcast ${EXE_FILE} ${EXE}

echo ""
echo "========================================================================"
echo ""

NTIMES=3
for ntime in $(seq 1 ${NTIMES}) ; do
    date
    echo "---- iteration $ntime -----------------------------------------------"
    echo ""

    CMD_OPTS="-n 253 -r 32"

    CMD="srun -n $NP ${EXE} $CMD_OPTS"
    echo "CMD=$CMD"
    $CMD

    echo ""
    echo "====================================================================="
done # loop ntimes

date
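Under the defaults above (16 nodes, 128 tasks per node), the script launches 2048 MPI processes with the same -n 253 -r 32 options as the example run in alltoallw.c. Assuming the alltoallw executable sits in the submission directory (sbcast copies it from a relative path), it would be submitted as:

    % sbatch sbatch_perlmutter.sh

Because the script falls back to 128 tasks per node only when SLURM_NTASKS_PER_NODE is unset, a different per-node process count can be requested at submission time, e.g. % sbatch --ntasks-per-node=64 sbatch_perlmutter.sh.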
