@@ -226,7 +226,10 @@ std::vector<ttb_real>
226226distributeTensorToVectorsDense (const Tensor& dn_tensor_host, ttb_indx nnz,
227227 MPI_Comm comm, ttb_indx rank, ttb_indx nprocs,
228228 ttb_indx& offset) {
229- constexpr ttb_indx dt_size = sizeof (ttb_real);
229+ // Send tensor as ttb_real instead of bytes to allow for (somewhat)
230+ // larger tensors that can fit within the 32-bit MPI limit
231+ const auto mpi_dtype = DistContext::toMpiType<ttb_real>();
232+ constexpr ttb_indx dt_size = 1 ;
230233 std::vector<ttb_real> Tvec;
231234 small_vector<ttb_indx> who_gets_what =
232235 detail::singleDimUniformBlocking (nnz, nprocs);
@@ -248,7 +251,7 @@ distributeTensorToVectorsDense(const Tensor& dn_tensor_host, ttb_indx nnz,
248251 total_sent += nelements;
249252
250253 const ttb_indx index_of_first_element = who_gets_what[i];
251- MPI_Isend (Tvec.data () + index_of_first_element, nbytes, MPI_BYTE , i, i,
254+ MPI_Isend (Tvec.data () + index_of_first_element, nbytes, mpi_dtype , i, i,
252255 comm, &requests[i - 1 ]);
253256 }
254257 MPI_Waitall (requests.size (), requests.data (), statuses.data ());
@@ -271,7 +274,13 @@ distributeTensorToVectorsDense(const Tensor& dn_tensor_host, ttb_indx nnz,
271274 const ttb_indx nelements = who_gets_what[rank + 1 ] - who_gets_what[rank];
272275 Tvec.resize (nelements);
273276 const ttb_indx nbytes = nelements * dt_size;
274- MPI_Recv (Tvec.data (), nbytes, MPI_BYTE, 0 , rank, comm, MPI_STATUS_IGNORE);
277+ if (nbytes > std::numeric_limits<int >::max ()) {
278+ std::cout << " Warning on MPI processor " << rank << " :" << std::endl
279+ << " The number of receives exceeds the maximum size of a 32-bit integer." << std::endl
280+ << " This will likely fail with most MPI implementations!" << std::endl
281+ << " Try distributing your tensor across more MPI processors." << std::endl;
282+ }
283+ MPI_Recv (Tvec.data (), nbytes, mpi_dtype, 0 , rank, comm, MPI_STATUS_IGNORE);
275284 }
276285
277286 return Tvec;
0 commit comments