Communications arena implementation (#3388)

mukul1992 · WeiqunZhang · web-flow · commit 4a4e8b3fb1b9 · 2023-06-28T09:04:56.000-07:00
## Summary

Implement a communications arena for comm buffers to replace `the_fa_arena`. It creates a separate arena when GPU-aware MPI is used and `the_arena` is not managed.

## Additional background

The motivation for this is a communication performance degradation that is observed for GPU-aware MPI with `amrex.the_arena_is_managed=0`. @WeiqunZhang has a hypothesis that this may be due to the need for frequent re-registering of comm buffer pointers when the comm buffers use the same device arena as the other compute data. Hence a separate arena in this case would alleviate this issue. `the_fa_arena` is eliminated in this PR and the communication buffer directly uses `the_comms_arena` to simplify the code.

## Performance tests

The above-stated performance degradation is particularly observed with the `GPU/CNS/Exec/Sod` code under `Tests` and is alleviated by using a separate comms arena, as seen in the performance data below. `original` refers to the state before the change in #3362 that made `the_fa_arena` point to the device arena, which allowed `amrex.the_arena_is_managed=1` with GPU-aware MPI without a significant performance hit. It is compared with the current development branch and the proposed comms arena implementation. The data showing the performance improvement from this PR is highlighted.

![Screenshot 2023-06-27 at 4 11 54 PM](https://github.com/AMReX-Codes/amrex/assets/18251677/ae16b822-0178-4679-a90f-255cad6c5451)

In other tests, such as the `ABecLaplacian` linear solve or the ERF code, using `amrex.the_arena_is_managed=0` did not show a significant performance hit, and using this comms arena implementation did not harm the performance either. More comprehensive tests would be required to determine the effect on other codes and platforms.

---------

Co-authored-by: Mukul Dave <mhdave@lbl.gov>
Co-authored-by: Weiqun Zhang <WeiqunZhang@lbl.gov>
diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
@@ -31,6 +31,7 @@ Arena* The_Async_Arena ();
 Arena* The_Device_Arena ();
 Arena* The_Managed_Arena ();
 Arena* The_Pinned_Arena ();
+Arena* The_Comms_Arena ();
 Arena* The_Cpu_Arena ();
 
 struct ArenaInfo
diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
@@ -34,15 +34,18 @@ namespace {
     Arena* the_managed_arena = nullptr;
     Arena* the_pinned_arena = nullptr;
     Arena* the_cpu_arena = nullptr;
+    Arena* the_comms_arena = nullptr;
 
     Long the_arena_init_size = 0L;
     Long the_device_arena_init_size = 1024*1024*8;
     Long the_managed_arena_init_size = 1024*1024*8;
     Long the_pinned_arena_init_size = 1024*1024*8;
+    Long the_comms_arena_init_size = 1024*1024*8;
     Long the_arena_release_threshold = std::numeric_limits<Long>::max();
     Long the_device_arena_release_threshold = std::numeric_limits<Long>::max();
     Long the_managed_arena_release_threshold = std::numeric_limits<Long>::max();
     Long the_pinned_arena_release_threshold = std::numeric_limits<Long>::max();
+    Long the_comms_arena_release_threshold = std::numeric_limits<Long>::max();
     Long the_async_arena_release_threshold = std::numeric_limits<Long>::max();
 #ifdef AMREX_USE_HIP
     bool the_arena_is_managed = false; // xxxxx HIP FIX HERE
@@ -276,6 +279,7 @@ Arena::Initialize ()
     BL_ASSERT(the_managed_arena == nullptr || the_managed_arena == The_BArena());
     BL_ASSERT(the_pinned_arena == nullptr);
     BL_ASSERT(the_cpu_arena == nullptr || the_cpu_arena == The_BArena());
+    BL_ASSERT(the_comms_arena == nullptr || the_comms_arena == The_BArena());
 
 #ifdef AMREX_USE_GPU
 #ifdef AMREX_USE_SYCL
@@ -292,10 +296,12 @@ Arena::Initialize ()
     pp.queryAdd( "the_device_arena_init_size",  the_device_arena_init_size);
     pp.queryAdd("the_managed_arena_init_size", the_managed_arena_init_size);
     pp.queryAdd( "the_pinned_arena_init_size",  the_pinned_arena_init_size);
+    pp.queryAdd( "the_comms_arena_init_size",  the_comms_arena_init_size);
     pp.queryAdd(       "the_arena_release_threshold" ,         the_arena_release_threshold);
     pp.queryAdd( "the_device_arena_release_threshold",  the_device_arena_release_threshold);
     pp.queryAdd("the_managed_arena_release_threshold", the_managed_arena_release_threshold);
     pp.queryAdd( "the_pinned_arena_release_threshold",  the_pinned_arena_release_threshold);
+    pp.queryAdd("the_comms_arena_release_threshold", the_comms_arena_release_threshold);
     pp.queryAdd(  "the_async_arena_release_threshold",   the_async_arena_release_threshold);
     pp.queryAdd("the_arena_is_managed", the_arena_is_managed);
     pp.queryAdd("abort_on_out_of_gpu_memory", abort_on_out_of_gpu_memory);
@@ -361,6 +367,22 @@ Arena::Initialize ()
                                   (the_pinned_arena_release_threshold));
     the_pinned_arena->registerForProfiling("Pinned Memory");
 
+#ifdef AMREX_USE_GPU
+    if (ParallelDescriptor::UseGpuAwareMpi()) {
+        if (!(the_arena->isDevice())) {
+            the_comms_arena = the_device_arena;
+        } else {
+            the_comms_arena = new CArena(0, ArenaInfo{}.SetDeviceMemory().SetReleaseThreshold
+                                        (the_comms_arena_release_threshold));
+            the_comms_arena->registerForProfiling("Comms Memory");
+        }
+    } else {
+        the_comms_arena = the_pinned_arena;
+    }
+#else
+    the_comms_arena = The_BArena();
+#endif
+
     if (the_device_arena_init_size > 0 && the_device_arena != the_arena) {
         BL_PROFILE("The_Device_Arena::Initialize()");
         void *p = the_device_arena->alloc(the_device_arena_init_size);
@@ -379,6 +401,13 @@ Arena::Initialize ()
         the_pinned_arena->free(p);
     }
 
+    if (the_comms_arena_init_size > 0 && the_comms_arena != the_arena
+        && the_comms_arena != the_device_arena && the_comms_arena != the_pinned_arena) {
+        BL_PROFILE("The_Comms_Arena::Initialize()");
+        void *p = the_comms_arena->alloc(the_comms_arena_init_size);
+        the_comms_arena->free(p);
+    }
+
     the_cpu_arena = The_BArena();
 
     // Initialize the null arena
@@ -440,6 +469,13 @@ Arena::PrintUsage ()
             p->PrintUsage("The  Pinned Arena");
         }
     }
+    if (The_Comms_Arena() && The_Comms_Arena() != The_Device_Arena()
+         && The_Comms_Arena() != The_Pinned_Arena()) {
+        auto* p = dynamic_cast<CArena*>(The_Comms_Arena());
+        if (p) {
+            p->PrintUsage("The   Comms Arena");
+        }
+    }
 }
 
 void
@@ -485,6 +521,13 @@ Arena::PrintUsageToFiles (const std::string& filename, const std::string& messag
             p->PrintUsage(ofs, "The  Pinned Arena", "    ");
         }
     }
+    if (The_Comms_Arena() && The_Comms_Arena() != The_Device_Arena()
+        && The_Comms_Arena() != The_Pinned_Arena()) {
+        auto* p = dynamic_cast<CArena*>(The_Comms_Arena());
+        if (p) {
+            p->PrintUsage(ofs, "The   Comms Arena", "    ");
+        }
+    }
 
     ofs << "\n";
 }
@@ -509,6 +552,13 @@ Arena::Finalize ()
     //   MultiFab mf(...);  // this should be scoped in { ... }
     //   amrex::Finalize();
     // mf cannot be used now, but it can at least be freed without a segfault
+    if (!dynamic_cast<BArena*>(the_comms_arena)) {
+        if (the_comms_arena != the_device_arena && the_comms_arena != the_pinned_arena) {
+            delete the_comms_arena;
+        }
+        the_comms_arena = nullptr;
+    }
+
     if (!dynamic_cast<BArena*>(the_device_arena)) {
         if (the_device_arena != the_arena) {
             delete the_device_arena;
@@ -600,4 +650,14 @@ The_Cpu_Arena ()
     }
 }
 
+Arena*
+The_Comms_Arena ()
+{
+    if        (the_comms_arena) {
+        return the_comms_arena;
+    } else {
+        return The_Null_Arena();
+    }
 }
+
+}
diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H
@@ -144,7 +144,7 @@ struct MFInfo {
 struct TheFaArenaDeleter {
     using pointer = char*;
     void operator()(pointer p) const noexcept {
-        The_FA_Arena()->free(p);
+        The_Comms_Arena()->free(p);
     }
 };
 using TheFaArenaPointer = std::unique_ptr<char, TheFaArenaDeleter>;
diff --git a/Src/Base/AMReX_FabArrayBase.H b/Src/Base/AMReX_FabArrayBase.H
@@ -721,8 +721,6 @@ bool CheckRcvStats (Vector<MPI_Status>& recv_stats, const Vector<std::size_t>& r
 
 std::ostream& operator<< (std::ostream& os, const FabArrayBase::BDKey& id);
 
-Arena* The_FA_Arena ();
-
 }
 
 #endif
diff --git a/Src/Base/AMReX_FabArrayBase.cpp b/Src/Base/AMReX_FabArrayBase.cpp
@@ -88,7 +88,6 @@ std::vector<std::string>                    FabArrayBase::m_region_tag;
 
 namespace
 {
-    Arena* the_fa_arena = nullptr;
     bool initialized = false;
 }
 
@@ -123,16 +122,6 @@ FabArrayBase::Initialize ()
         MaxComp = 1;
     }
 
-#ifdef AMREX_USE_GPU
-    if (ParallelDescriptor::UseGpuAwareMpi()) {
-        the_fa_arena = The_Device_Arena();
-    } else {
-        the_fa_arena = The_Pinned_Arena();
-    }
-#else
-    the_fa_arena = The_Cpu_Arena();
-#endif
-
     amrex::ExecOnFinalize(FabArrayBase::Finalize);
 
 #ifdef AMREX_MEM_PROFILING
@@ -159,12 +148,6 @@ FabArrayBase::Initialize ()
 #endif
 }
 
-Arena*
-The_FA_Arena ()
-{
-    return the_fa_arena;
-}
-
 FabArrayBase::FabArrayBase (const BoxArray&            bxs,
                             const DistributionMapping& dm,
                             int                        nvar,
@@ -2245,8 +2228,6 @@ FabArrayBase::Finalize ()
 
     m_FA_stats = FabArrayStats();
 
-    the_fa_arena = nullptr;
-
     initialized = false;
 }
 
diff --git a/Src/Base/AMReX_FabArrayCommI.H b/Src/Base/AMReX_FabArrayCommI.H
@@ -228,7 +228,7 @@ FabArray<FAB>::FillBoundary_finish ()
 
         if (fbd->the_recv_data)
         {
-            amrex::The_FA_Arena()->free(fbd->the_recv_data);
+            amrex::The_Comms_Arena()->free(fbd->the_recv_data);
             fbd->the_recv_data = nullptr;
         }
     }
@@ -237,7 +237,7 @@ FabArray<FAB>::FillBoundary_finish ()
     if (N_snds > 0) {
         Vector<MPI_Status> stats(fbd->send_reqs.size());
         ParallelDescriptor::Waitall(fbd->send_reqs, stats);
-        amrex::The_FA_Arena()->free(fbd->the_send_data);
+        amrex::The_Comms_Arena()->free(fbd->the_send_data);
         fbd->the_send_data = nullptr;
     }
 
@@ -548,7 +548,7 @@ FabArray<FAB>::ParallelCopy_finish ()
 
         if (pcd->the_recv_data)
         {
-            amrex::The_FA_Arena()->free(pcd->the_recv_data);
+            amrex::The_Comms_Arena()->free(pcd->the_recv_data);
             pcd->the_recv_data = nullptr;
         }
     }
@@ -558,7 +558,7 @@ FabArray<FAB>::ParallelCopy_finish ()
             Vector<MPI_Status> stats(pcd->send_reqs.size());
             ParallelDescriptor::Waitall(pcd->send_reqs, stats);
         }
-        amrex::The_FA_Arena()->free(pcd->the_send_data);
+        amrex::The_Comms_Arena()->free(pcd->the_send_data);
         pcd->the_send_data = nullptr;
     }
 
@@ -685,7 +685,7 @@ FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers&     SndTags,
 
     if (total_volume > 0)
     {
-        the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume));
+        the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume));
         for (int i = 0, N = static_cast<int>(send_size.size()); i < N; ++i) {
             send_data[i] = the_send_data + offset[i];
         }
@@ -783,7 +783,7 @@ FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers&  RcvTags,
     }
     else
     {
-        the_recv_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalRcvsVolume));
+        the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
 
         for (int i = 0; i < nrecv; ++i)
         {
@@ -1004,7 +1004,7 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
             recv_size.push_back(nbytes);
         }
 
-        the_recv_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalRcvsVolume));
+        the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));
 
         int k = 0;
         for (int i = 0; i < nrecv; ++i) {
@@ -1077,7 +1077,7 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
             send_size.push_back(nbytes);
         }
 
-        the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalSndsVolume));
+        the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalSndsVolume));
         int k = 0;
         for (int i = 0; i < nsend; ++i) {
             send_data[i] = the_send_data + offset[i];
@@ -1113,13 +1113,13 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,
 
         detail::fbv_copy(recv_tags);
 
-        amrex::The_FA_Arena()->free(the_recv_data);
+        amrex::The_Comms_Arena()->free(the_recv_data);
     }
 
     if (N_snds > 0) {
         Vector<MPI_Status> stats(send_reqs.size());
         ParallelDescriptor::Waitall(send_reqs, stats);
-        amrex::The_FA_Arena()->free(the_send_data);
+        amrex::The_Comms_Arena()->free(the_send_data);
     }
 
 #endif  // #ifdef AMREX_USE_MPI
diff --git a/Src/Base/AMReX_NonLocalBC.cpp b/Src/Base/AMReX_NonLocalBC.cpp
@@ -95,7 +95,7 @@ void PrepareCommBuffers(CommData& comm,
     }
     else
     {
-        comm.the_data.reset(static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume)));
+        comm.the_data.reset(static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume)));
         for (int i = 0; i < N_comms; ++i) {
             comm.data[i] = comm.the_data.get() + comm.offset[i];
         }

Original file line number	Diff line number	Diff line change
`@@ -144,7 +144,7 @@ struct MFInfo {`
`144`	`144`	`struct TheFaArenaDeleter {`
`145`	`145`	`using pointer = char*;`
`146`	`146`	`void operator()(pointer p) const noexcept {`
`147`		`- The_FA_Arena()->free(p);`
	`147`	`+ The_Comms_Arena()->free(p);`
`148`	`148`	`}`
`149`	`149`	`};`
`150`	`150`	`using TheFaArenaPointer = std::unique_ptr<char, TheFaArenaDeleter>;`
Original file line number	Diff line number	Diff line change
`@@ -721,8 +721,6 @@ bool CheckRcvStats (Vector<MPI_Status>& recv_stats, const Vector<std::size_t>& r`
`721`	`721`
`722`	`722`	`std::ostream& operator<< (std::ostream& os, const FabArrayBase::BDKey& id);`
`723`	`723`
`724`		`-Arena* The_FA_Arena ();`
`725`		`-`
`726`	`724`	`}`
`727`	`725`
`728`	`726`	`#endif`
Original file line number	Diff line number	Diff line change
`@@ -228,7 +228,7 @@ FabArray<FAB>::FillBoundary_finish ()`
`228`	`228`
`229`	`229`	`if (fbd->the_recv_data)`
`230`	`230`	`{`
`231`		`- amrex::The_FA_Arena()->free(fbd->the_recv_data);`
	`231`	`+ amrex::The_Comms_Arena()->free(fbd->the_recv_data);`
`232`	`232`	`fbd->the_recv_data = nullptr;`
`233`	`233`	`}`
`234`	`234`	`}`
`@@ -237,7 +237,7 @@ FabArray<FAB>::FillBoundary_finish ()`
`237`	`237`	`if (N_snds > 0) {`
`238`	`238`	`Vector<MPI_Status> stats(fbd->send_reqs.size());`
`239`	`239`	`ParallelDescriptor::Waitall(fbd->send_reqs, stats);`
`240`		`- amrex::The_FA_Arena()->free(fbd->the_send_data);`
	`240`	`+ amrex::The_Comms_Arena()->free(fbd->the_send_data);`
`241`	`241`	`fbd->the_send_data = nullptr;`
`242`	`242`	`}`
`243`	`243`
`@@ -548,7 +548,7 @@ FabArray<FAB>::ParallelCopy_finish ()`
`548`	`548`
`549`	`549`	`if (pcd->the_recv_data)`
`550`	`550`	`{`
`551`		`- amrex::The_FA_Arena()->free(pcd->the_recv_data);`
	`551`	`+ amrex::The_Comms_Arena()->free(pcd->the_recv_data);`
`552`	`552`	`pcd->the_recv_data = nullptr;`
`553`	`553`	`}`
`554`	`554`	`}`
`@@ -558,7 +558,7 @@ FabArray<FAB>::ParallelCopy_finish ()`
`558`	`558`	`Vector<MPI_Status> stats(pcd->send_reqs.size());`
`559`	`559`	`ParallelDescriptor::Waitall(pcd->send_reqs, stats);`
`560`	`560`	`}`
`561`		`- amrex::The_FA_Arena()->free(pcd->the_send_data);`
	`561`	`+ amrex::The_Comms_Arena()->free(pcd->the_send_data);`
`562`	`562`	`pcd->the_send_data = nullptr;`
`563`	`563`	`}`
`564`	`564`
`@@ -685,7 +685,7 @@ FabArray<FAB>::PrepareSendBuffers (const MapOfCopyComTagContainers& SndTags,`
`685`	`685`
`686`	`686`	`if (total_volume > 0)`
`687`	`687`	`{`
`688`		`- the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume));`
	`688`	`+ the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume));`
`689`	`689`	`for (int i = 0, N = static_cast<int>(send_size.size()); i < N; ++i) {`
`690`	`690`	`send_data[i] = the_send_data + offset[i];`
`691`	`691`	`}`
`@@ -783,7 +783,7 @@ FabArray<FAB>::PostRcvs (const MapOfCopyComTagContainers& RcvTags,`
`783`	`783`	`}`
`784`	`784`	`else`
`785`	`785`	`{`
`786`		`- the_recv_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalRcvsVolume));`
	`786`	`+ the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));`
`787`	`787`
`788`	`788`	`for (int i = 0; i < nrecv; ++i)`
`789`	`789`	`{`
`@@ -1004,7 +1004,7 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,`
`1004`	`1004`	`recv_size.push_back(nbytes);`
`1005`	`1005`	`}`
`1006`	`1006`
`1007`		`- the_recv_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalRcvsVolume));`
	`1007`	`+ the_recv_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalRcvsVolume));`
`1008`	`1008`
`1009`	`1009`	`int k = 0;`
`1010`	`1010`	`for (int i = 0; i < nrecv; ++i) {`
`@@ -1077,7 +1077,7 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,`
`1077`	`1077`	`send_size.push_back(nbytes);`
`1078`	`1078`	`}`
`1079`	`1079`
`1080`		`- the_send_data = static_cast<char*>(amrex::The_FA_Arena()->alloc(TotalSndsVolume));`
	`1080`	`+ the_send_data = static_cast<char*>(amrex::The_Comms_Arena()->alloc(TotalSndsVolume));`
`1081`	`1081`	`int k = 0;`
`1082`	`1082`	`for (int i = 0; i < nsend; ++i) {`
`1083`	`1083`	`send_data[i] = the_send_data + offset[i];`
`@@ -1113,13 +1113,13 @@ FillBoundary (Vector<MF*> const& mf, Vector<int> const& scomp,`
`1113`	`1113`
`1114`	`1114`	`detail::fbv_copy(recv_tags);`
`1115`	`1115`
`1116`		`- amrex::The_FA_Arena()->free(the_recv_data);`
	`1116`	`+ amrex::The_Comms_Arena()->free(the_recv_data);`
`1117`	`1117`	`}`
`1118`	`1118`
`1119`	`1119`	`if (N_snds > 0) {`
`1120`	`1120`	`Vector<MPI_Status> stats(send_reqs.size());`
`1121`	`1121`	`ParallelDescriptor::Waitall(send_reqs, stats);`
`1122`		`- amrex::The_FA_Arena()->free(the_send_data);`
	`1122`	`+ amrex::The_Comms_Arena()->free(the_send_data);`
`1123`	`1123`	`}`
`1124`	`1124`
`1125`	`1125`	`#endif // #ifdef AMREX_USE_MPI`
Original file line number	Diff line number	Diff line change
`@@ -95,7 +95,7 @@ void PrepareCommBuffers(CommData& comm,`
`95`	`95`	`}`
`96`	`96`	`else`
`97`	`97`	`{`
`98`		`- comm.the_data.reset(static_cast<char*>(amrex::The_FA_Arena()->alloc(total_volume)));`
	`98`	`+ comm.the_data.reset(static_cast<char*>(amrex::The_Comms_Arena()->alloc(total_volume)));`
`99`	`99`	`for (int i = 0; i < N_comms; ++i) {`
`100`	`100`	`comm.data[i] = comm.the_data.get() + comm.offset[i];`
`101`	`101`	`}`