Han gatherv noncontiguous datatype fix #12439

Merged
1 change: 1 addition & 0 deletions ompi/mca/coll/han/coll_han.h
@@ -331,6 +331,7 @@ typedef struct mca_coll_han_module_t {
int *cached_topo;
bool is_mapbycore;
bool are_ppn_imbalanced;
bool is_heterogeneous;

/* To be able to fallback when the cases are not supported */
struct mca_coll_han_collectives_fallback_s fallback;
17 changes: 8 additions & 9 deletions ompi/mca/coll/han/coll_han_gatherv.c
@@ -149,11 +149,8 @@ int mca_coll_han_gatherv_intra(const void *sbuf, int scount, struct ompi_datatyp
root_low_rank, low_comm,
low_comm->c_coll->coll_gatherv_module);

size_t rdsize;
char *tmp_rbuf = rbuf;

ompi_datatype_type_size(rdtype, &rdsize);

up_rcounts = calloc(up_size, sizeof(int));
up_displs = malloc(up_size * sizeof(int));
up_peer_ub = calloc(up_size, sizeof(int));
@@ -210,7 +207,9 @@ int mca_coll_han_gatherv_intra(const void *sbuf, int scount, struct ompi_datatyp
}

if (need_bounce_buf) {
bounce_buf = malloc(rdsize * total_up_rcounts);
ptrdiff_t rsize, rgap;
rsize = opal_datatype_span(&rdtype->super, total_up_rcounts, &rgap);
bounce_buf = malloc(rsize);
if (!bounce_buf) {
err = OMPI_ERR_OUT_OF_RESOURCE;
goto root_out;
@@ -222,7 +221,7 @@ int mca_coll_han_gatherv_intra(const void *sbuf, int scount, struct ompi_datatyp
: 0;
}

tmp_rbuf = bounce_buf;
tmp_rbuf = bounce_buf - rgap;
}

/* Up Gatherv */
@@ -231,7 +230,8 @@ int mca_coll_han_gatherv_intra(const void *sbuf, int scount, struct ompi_datatyp

/* Use a temp buffer to reorder the output buffer if needed */
if (need_bounce_buf) {
ptrdiff_t offset = 0;
ptrdiff_t offset = 0, rdext;
ompi_datatype_type_extent(rdtype, &rdext);

for (int i = 0; i < w_size; ++i) {
up_peer = topo[2 * i];
@@ -242,10 +242,9 @@ int mca_coll_han_gatherv_intra(const void *sbuf, int scount, struct ompi_datatyp
w_peer = topo[2 * i + 1];

ompi_datatype_copy_content_same_ddt(rdtype, (size_t) rcounts[w_peer],
(char *) rbuf
+ (size_t) displs[w_peer] * rdsize,
(char *) rbuf + (size_t) displs[w_peer] * rdext,
bounce_buf + offset);
offset += rdsize * (size_t) rcounts[w_peer];
offset += rdext * (size_t) rcounts[w_peer];
}
}

33 changes: 23 additions & 10 deletions ompi/mca/coll/han/coll_han_scatterv.c
@@ -55,6 +55,12 @@
* to send the data in the correct order even if the processes are NOT mapped by core.
* 2. In the send buffer, other than the root's node, data destined to the same node are continuous
* - it is ok if data to different nodes has gap.
*
* Limitation:
* The node leader acts as a broker between the Root and the node followers, but it cannot match the
* exact type signature of the followers; instead it forwards the intermediate data from the Root in
* packed form as MPI_BYTE. This works for Gatherv but NOT for Scatterv if the Root has a different
* architecture, e.g. a different endianness or integer representation.
*/
int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int *displs,
struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
@@ -94,6 +100,14 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
return han_module->previous_scatterv(sbuf, scounts, displs, sdtype, rbuf, rcount, rdtype,
root, comm, han_module->previous_scatterv_module);
}
if (han_module->is_heterogeneous) {
OPAL_OUTPUT_VERBOSE((30, mca_coll_han_component.han_output,
"han cannot handle scatterv with this communicator (heterogeneous). Fall "
"back on another component\n"));
HAN_LOAD_FALLBACK_COLLECTIVE(han_module, comm, scatterv);
return han_module->previous_scatterv(sbuf, scounts, displs, sdtype, rbuf, rcount, rdtype,
root, comm, han_module->previous_scatterv_module);
}

w_rank = ompi_comm_rank(comm);
w_size = ompi_comm_size(comm);
@@ -125,7 +139,6 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
int need_bounce_buf = 0, total_up_scounts = 0, *up_displs = NULL, *up_scounts = NULL,
*up_peer_lb = NULL, *up_peer_ub = NULL;
char *reorder_sbuf = (char *) sbuf, *bounce_buf = NULL;
size_t sdsize;

low_scounts = malloc(low_size * sizeof(int));
low_displs = malloc(low_size * sizeof(int));
@@ -144,8 +157,6 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
low_scounts[low_peer] = scounts[w_peer];
}

ompi_datatype_type_size(sdtype, &sdsize);
Member:

Same comment as for gather except that you should use unpack to go from a packed buffer into the local type.
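
For illustration only, a minimal sketch of the pack/unpack pattern being suggested, written against the public MPI_Unpack API rather than HAN's internal routines; the function and variable names are hypothetical, not the actual HAN code.

#include <mpi.h>

/* Illustrative: a node follower receives a packed byte stream forwarded by its
 * leader and unpacks it into its own receive datatype.  Because MPI_Unpack
 * applies the local type map, the follower's datatype does not have to match
 * the leader's; only the type signature does. */
static void unpack_from_leader(const void *packed_buf, int packed_bytes,
                               void *rbuf, int rcount, MPI_Datatype rdtype,
                               MPI_Comm comm)
{
    int position = 0;
    MPI_Unpack(packed_buf, packed_bytes, &position, rbuf, rcount, rdtype, comm);
}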

Contributor Author:

Currently inside ompi we need an extra data size check in order to use MPI_PACKED - it is currently possible that the total byte size (datatype size x count) can exceed INT_MAX.

I think we should switch to explicit pack/unpack after large count support.
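
A rough sketch of the kind of overflow guard described above, using the public MPI_Type_size API; this is illustrative only, and a real implementation would fall back to another algorithm when the check fails.

#include <limits.h>
#include <mpi.h>

/* Illustrative: before re-expressing 'count' elements of 'dtype' as a count of
 * MPI_PACKED/MPI_BYTE, check that the byte total still fits in the int count
 * argument used by the pre-large-count MPI API. */
static int byte_count_fits_in_int(MPI_Datatype dtype, size_t count)
{
    int type_size;
    MPI_Type_size(dtype, &type_size);
    return (size_t) type_size * count <= (size_t) INT_MAX;
}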

Contributor Author:

@bosilca I took time to come up with possible optimizations following our discussion on Monday. I was trying to take advantage of the type map, but soon realized that this is not generally useful between node leader and follower.

For scatterv, the invariant is actually between Root and other processes:

The type signature implied by sendcount[i], sendtype at the root must be equal to the type signature implied by recvcount, recvtype at MPI process i (however, the type maps may be different). This implies that the amount of data sent must be equal to the amount of data received, pairwise between each MPI process and the root. Distinct type maps between sender and receiver are still allowed.

When we focus on the node leader and its local neighbors, this information is not helpful, since they do not need to match in their respective recvcount, recvtype.

There is a simplification (rather than optimization) opportunity though, if and only if the node leader's recvtype's type map size is the same as its local followers'. In that case we don't need MPI_BYTE - we can safely use recvtype instead. This wouldn't bring better performance though.
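
As a hypothetical illustration of that simplification (this helper is not part of the PR), the node leader could check over the low communicator whether every local process reports the same datatype size, reusing the max(x)/max(-x) trick the topology code already uses to detect imbalanced ppn.

#include <mpi.h>

/* Hypothetical helper, not in this PR: returns 1 if all processes on the node
 * report the same datatype size, in which case the leader could forward the
 * data typed as 'dtype' instead of as raw MPI_BYTE. */
static int node_type_sizes_match(MPI_Datatype dtype, MPI_Comm low_comm)
{
    int size, minmax[2];
    MPI_Type_size(dtype, &size);
    minmax[0] = size;
    minmax[1] = -size;  /* MAX of -size equals -(MIN of size) */
    MPI_Allreduce(MPI_IN_PLACE, minmax, 2, MPI_INT, MPI_MAX, low_comm);
    return minmax[0] == -minmax[1];
}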

Member:

Because the typemap provided by the different processes must match when complemented with the count, we could use MPI_Get_elements to build any datatype used in the communication, as long as we know the size in bytes we are supposed to handle.
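
For reference, a minimal illustration of the MPI_Get_elements API mentioned above (not HAN code): after a typed receive it reports how many primitive elements actually arrived, which is the kind of information a leader would need to rebuild an equivalent datatype.

#include <mpi.h>

/* Illustrative: count the primitive elements delivered by a typed receive.
 * MPI_Get_elements reports basic elements even when the incoming message
 * fills only part of the last copy of 'type'. */
static int recv_and_count_elements(void *buf, int count, MPI_Datatype type,
                                   int src, int tag, MPI_Comm comm)
{
    MPI_Status status;
    int n_elements;
    MPI_Recv(buf, count, type, src, tag, comm, &status);
    MPI_Get_elements(&status, type, &n_elements);
    return n_elements;  /* number of basic (predefined) elements received */
}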


up_scounts = calloc(up_size, sizeof(int));
up_displs = malloc(up_size * sizeof(int));
up_peer_ub = calloc(up_size, sizeof(int));
@@ -201,11 +212,14 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
}

if (need_bounce_buf) {
bounce_buf = malloc(sdsize * total_up_scounts);
ptrdiff_t ssize, sgap;
ssize = opal_datatype_span(&sdtype->super, total_up_scounts, &sgap);
bounce_buf = malloc(ssize);
if (!bounce_buf) {
err = OMPI_ERR_OUT_OF_RESOURCE;
goto root_out;
}
reorder_sbuf = bounce_buf - sgap;

/* Calculate displacements for the inter-node scatterv */
for (up_peer = 0; up_peer < up_size; ++up_peer) {
@@ -214,7 +228,8 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
}

/* Use a temp buffer to reorder the send buffer if needed */
ptrdiff_t offset = 0;
ptrdiff_t offset = 0, sdext;
ompi_datatype_type_extent(sdtype, &sdext);

for (int i = 0; i < w_size; ++i) {
up_peer = topo[2 * i];
Expand All @@ -225,13 +240,11 @@ int mca_coll_han_scatterv_intra(const void *sbuf, const int *scounts, const int
w_peer = topo[2 * i + 1];

ompi_datatype_copy_content_same_ddt(sdtype, (size_t) scounts[w_peer],
bounce_buf + offset,
reorder_sbuf + offset,
(char *) sbuf
+ (size_t) displs[w_peer] * sdsize);
offset += sdsize * (size_t) scounts[w_peer];
+ (size_t) displs[w_peer] * sdext);
offset += sdext * (size_t) scounts[w_peer];
}

reorder_sbuf = bounce_buf;
}

/* Up Iscatterv */
23 changes: 16 additions & 7 deletions ompi/mca/coll/han/coll_han_topo.c
@@ -92,12 +92,19 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
}
assert(up_comm != NULL && low_comm != NULL);

int up_rank = ompi_comm_rank(up_comm);
int low_rank = ompi_comm_rank(low_comm);
int low_size = ompi_comm_size(low_comm);

ompi_proc_t *up_proc = NULL;

int *topo = (int *)malloc(sizeof(int) * size * num_topo_level);
int is_imbalanced = 1;
int ranks_non_consecutive = 0;
int is_imbalanced = 1, ranks_non_consecutive = 0, is_heterogeneous = 0;

if (0 != up_rank) {
up_proc = ompi_comm_peer_lookup(up_comm, 0);
is_heterogeneous = up_proc->super.proc_convertor->remoteArch != opal_local_arch;
}

/* node leaders translate the node-local ranks to global ranks and check whether they are placed consecutively */
if (0 == low_rank) {
@@ -116,15 +123,16 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
}
}

int reduce_vals[] = {ranks_non_consecutive, low_size, -low_size};
int reduce_vals[] = {ranks_non_consecutive, low_size, -low_size, is_heterogeneous};

up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 3,
up_comm->c_coll->coll_allreduce(MPI_IN_PLACE, &reduce_vals, 4,
MPI_INT, MPI_MAX, up_comm,
up_comm->c_coll->coll_allreduce_module);

/* is the distribution of processes balanced per node? */
is_imbalanced = (reduce_vals[1] == -reduce_vals[2]) ? 0 : 1;
ranks_non_consecutive = reduce_vals[0];
is_heterogeneous = reduce_vals[3];

if ( ranks_non_consecutive && !is_imbalanced ) {
/* kick off up_comm allgather to collect non-consecutive rank information at node leaders */
@@ -136,12 +144,13 @@ mca_coll_han_topo_init(struct ompi_communicator_t *comm,
}


/* broadcast balanced and consecutive properties from node leaders to remaining ranks */
int bcast_vals[] = {is_imbalanced, ranks_non_consecutive};
low_comm->c_coll->coll_bcast(bcast_vals, 2, MPI_INT, 0,
/* broadcast balanced, consecutive and homogeneity properties from node leaders to remaining ranks */
int bcast_vals[] = {is_imbalanced, ranks_non_consecutive, is_heterogeneous};
low_comm->c_coll->coll_bcast(bcast_vals, 3, MPI_INT, 0,
low_comm, low_comm->c_coll->coll_bcast_module);
is_imbalanced = bcast_vals[0];
ranks_non_consecutive = bcast_vals[1];
han_module->is_heterogeneous = bcast_vals[2];

/* error out if the rank distribution is not balanced */
if (is_imbalanced) {