Remove need to sync Gpu stream before deallocating memory #4432

Open · wants to merge 9 commits into base: development
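For context, the practical effect on user code is roughly the following (a hedged sketch; the kernel-launch comment and the old explicit-synchronization pattern are illustrative, not taken from this PR):

```cpp
#include <AMReX.H>
#include <AMReX_Arena.H>
#include <AMReX_Gpu.H>

void example ()
{
    // Allocate device memory from the default AMReX arena.
    void* p = amrex::The_Arena()->alloc(1024);  // 1024 bytes of arena memory

    // ... launch GPU kernels on the current stream that read/write p ...

    // Before this change: the stream had to be synchronized before freeing,
    // otherwise the block could be handed to a new allocation while a kernel
    // was still using it.
    // amrex::Gpu::streamSynchronize();

    // After this change: free() can be called immediately; for device-accessible
    // CArena memory the actual release is deferred until the stream that may
    // still touch the memory has been synchronized.
    amrex::The_Arena()->free(p);
}
```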
1 change: 1 addition & 0 deletions Src/Base/AMReX_Arena.cpp
@@ -175,6 +175,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
{
std::size_t free_mem_avail = Gpu::Device::freeMemAvailable();
if (nbytes >= free_mem_avail) {
Gpu::streamSynchronizeAll(); // this could cause some memory to be freed
free_mem_avail += freeUnused_protected(); // For CArena, mutex has already acquired
if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail) {
amrex::Abort("Out of gpu memory. Free: " + std::to_string(free_mem_avail)
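Why synchronizing helps here: under the deferred-free scheme introduced in AMReX_GpuDevice.H below, pointers freed through a device-accessible CArena wait on a per-stream list until that stream is synchronized. A minimal sketch of what streamSynchronizeAll() presumably amounts to with the new StreamManager pool (inferred from the interface below, not copied from the PR):

```cpp
// Presumed behavior, not the PR's literal implementation: synchronize every
// stream in the pool.  Each sync() drains that stream's deferred-free wait
// list, handing memory back to the owning arenas so that the
// freeUnused_protected() call above can actually release it.
void streamSynchronizeAll_sketch ()
{
    for (auto& sm : gpu_stream_pool) {   // Vector<StreamManager>, one per stream
        sm.sync();
    }
}
```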
2 changes: 2 additions & 0 deletions Src/Base/AMReX_CArena.H
@@ -63,6 +63,8 @@ public:
*/
void free (void* vp) final;

void free_now (void* vp);

std::size_t freeUnused () final;

/**
18 changes: 18 additions & 0 deletions Src/Base/AMReX_CArena.cpp
@@ -2,6 +2,7 @@
#include <AMReX_CArena.H>
#include <AMReX_BLassert.H>
#include <AMReX_Gpu.H>
#include <AMReX_GpuDevice.H>
#include <AMReX_ParallelReduce.H>

#include <utility>
@@ -265,6 +266,23 @@ CArena::free (void* vp)
return;
}

if (this->isDeviceAccessible()) {
Gpu::Device::freeAfterSync(this, vp);
} else {
free_now(vp);
}
}

void
CArena::free_now (void* vp)
{
if (vp == nullptr) {
//
// Allow calls with NULL as allowed by C++ delete.
//
return;
}

std::lock_guard<std::mutex> lock(carena_mutex);

//
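Putting the two halves together: free() queues device-accessible pointers on the current stream, and free_now() performs the immediate release once that stream has synchronized. Below is a self-contained toy of that dispatch; the names (ToyArena, stream_wait_list) are hypothetical stand-ins for CArena and StreamManager::m_free_wait_list, not AMReX code:

```cpp
#include <cstdlib>
#include <mutex>
#include <utility>
#include <vector>

struct ToyArena {
    bool device_accessible = true;
    std::mutex mtx;

    // Immediate release, analogous to CArena::free_now().
    void free_now (void* p) {
        if (p == nullptr) { return; }   // allow free(nullptr), like C++ delete
        std::lock_guard<std::mutex> lock(mtx);
        std::free(p);
    }

    // Deferred release, analogous to the new CArena::free().
    void free (void* p, std::vector<std::pair<ToyArena*, void*>>& stream_wait_list) {
        if (p == nullptr) { return; }
        if (device_accessible) {
            stream_wait_list.emplace_back(this, p);  // released only after the stream syncs
        } else {
            free_now(p);                             // host-only memory can go right away
        }
    }
};

int main ()
{
    ToyArena arena;
    std::vector<std::pair<ToyArena*, void*>> wait_list;  // stands in for m_free_wait_list
    void* p = std::malloc(64);

    arena.free(p, wait_list);  // deferred: p may still be read by in-flight stream work
    // ... a stream synchronization would happen here ...
    for (auto& item : wait_list) { item.first->free_now(item.second); }  // drain after sync
    wait_list.clear();
    return 0;
}
```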
63 changes: 55 additions & 8 deletions Src/Base/AMReX_GpuDevice.H
@@ -16,6 +16,7 @@
#include <cstdlib>
#include <cstring>
#include <memory>
#include <mutex>

#define AMREX_GPU_MAX_STREAMS 8

@@ -46,8 +47,28 @@ using gpuDeviceProp_t = cudaDeviceProp;
}
#endif

namespace amrex {
class CArena;
}

namespace amrex::Gpu {

#ifdef AMREX_USE_GPU
class StreamManager {
gpuStream_t m_stream;
std::uint64_t m_stream_op_id = 0;
std::uint64_t m_last_sync = 0;
Vector<std::pair<CArena*, void*>> m_free_wait_list;
std::mutex m_mutex;
public:
[[nodiscard]] gpuStream_t get ();
[[nodiscard]] gpuStream_t& internal_get ();
void sync ();
void internal_after_sync ();
void stream_free (CArena* arena, void* mem);
};
#endif

class Device
{

@@ -57,17 +78,32 @@ public:
static void Finalize ();

#if defined(AMREX_USE_GPU)
static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
static gpuStream_t gpuStream () noexcept {
return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
}
#ifdef AMREX_USE_CUDA
/** for backward compatibility */
static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
static cudaStream_t cudaStream () noexcept {
return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
}
#endif
#ifdef AMREX_USE_SYCL
static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
static sycl::queue& streamQueue () noexcept {
return *(gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get().queue);
}
static sycl::queue& streamQueue (int i) noexcept {
return *(gpu_stream_pool[i].get().queue);
}
#endif
#endif

static void freeAfterSync (CArena* arena, void* mem) noexcept {
amrex::ignore_unused(arena, mem);
#ifdef AMREX_USE_CUDA
gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].stream_free(arena, mem);
#endif
}

static int numGpuStreams () noexcept {
return inSingleStreamRegion() ? 1 : max_gpu_streams;
}
@@ -104,6 +140,16 @@ public:
*/
static void streamSynchronizeAll () noexcept;

#ifdef AMREX_USE_GPU
/**
* Halt execution of code until the given GPU stream has finished processing all
* previously enqueued tasks. Unlike streamSynchronize, which skips redundant
* synchronizations when called multiple times in a row, this function always
* synchronizes the stream.
*/
static void actualStreamSynchronize (gpuStream_t stream) noexcept;
#endif

#if defined(__CUDACC__)
/** Generic graph selection. These should be called by users. */
static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
@@ -196,10 +242,11 @@ private:
static AMREX_EXPORT dim3 numThreadsMin;
static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;

static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_stream
// The non-owning gpu_stream is used to store the current stream that will be used.
// gpu_stream is a vector so that it's thread safe to write to it.
static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
// The non-owning gpu_stream_index is used to store the current stream index that will be used.
// gpu_stream_index is a vector so that it's thread safe to write to it.
static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads

static AMREX_EXPORT gpuDeviceProp_t device_prop;
static AMREX_EXPORT int memory_pools_supported;
static AMREX_EXPORT unsigned int max_blocks_per_launch;
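Finally, a self-contained sketch of the bookkeeping suggested by StreamManager's members (m_stream_op_id, m_last_sync, m_free_wait_list) and by the doc comment on actualStreamSynchronize. The counter logic and the note_stream_op helper are assumptions drawn from the interface, not the PR's actual implementation:

```cpp
#include <cstdint>
#include <cstdlib>
#include <mutex>
#include <utility>
#include <vector>

struct ToyStreamManager {
    std::uint64_t op_id     = 0;   // bumped whenever work or a deferred free is queued
    std::uint64_t last_sync = 0;   // value of op_id at the time of the last real sync
    std::vector<std::pair<void (*)(void*), void*>> free_wait_list;  // (deleter, ptr)
    std::mutex mtx;

    // Hypothetical stand-in for StreamManager::get(): record that a stream op was queued.
    void note_stream_op () { ++op_id; }

    // Analogous to StreamManager::stream_free(): queue the pointer instead of freeing it.
    void stream_free (void (*deleter)(void*), void* p) {
        std::lock_guard<std::mutex> lock(mtx);
        free_wait_list.emplace_back(deleter, p);
        ++op_id;
    }

    // Analogous to StreamManager::sync(): skip redundant syncs, then drain the wait list
    // (the drain plays the role of internal_after_sync()).
    void sync () {
        std::lock_guard<std::mutex> lock(mtx);
        if (op_id == last_sync) { return; }   // nothing new since the last sync
        // actualStreamSynchronize(m_stream) would be called here in the real code.
        last_sync = op_id;
        for (auto& item : free_wait_list) { item.first(item.second); }
        free_wait_list.clear();
    }
};

int main ()
{
    ToyStreamManager sm;
    void* p = std::malloc(32);

    sm.note_stream_op();                               // e.g. a kernel launch on this stream
    sm.stream_free([](void* q) { std::free(q); }, p);  // deferred free of p
    sm.sync();   // "synchronizes" the stream and releases p
    sm.sync();   // no new ops since the last sync: returns immediately
    return 0;
}
```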