From da4705564d6cfbb9fa92966fd3f47b3a4082df3a Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Sun, 27 Apr 2025 19:22:59 +0200
Subject: [PATCH 1/6] Remove need to sync Gpu stream before dealloc

---
 Src/Base/AMReX_CArena.H      |   2 +
 Src/Base/AMReX_CArena.cpp    |  18 +++++
 Src/Base/AMReX_GpuDevice.H   |  47 ++++++++++--
 Src/Base/AMReX_GpuDevice.cpp | 145 ++++++++++++++++++++++++-----------
 4 files changed, 159 insertions(+), 53 deletions(-)

diff --git a/Src/Base/AMReX_CArena.H b/Src/Base/AMReX_CArena.H
index bc46d008241..852f34e5573 100644
--- a/Src/Base/AMReX_CArena.H
+++ b/Src/Base/AMReX_CArena.H
@@ -63,6 +63,8 @@ public:
     */
    void free (void* vp) final;
 
+    void free_now (void* vp);
+
    std::size_t freeUnused () final;
 
    /**
diff --git a/Src/Base/AMReX_CArena.cpp b/Src/Base/AMReX_CArena.cpp
index bc5297f4d52..35437ae63cc 100644
--- a/Src/Base/AMReX_CArena.cpp
+++ b/Src/Base/AMReX_CArena.cpp
@@ -2,6 +2,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -262,6 +263,23 @@ CArena::free (void* vp)
        return;
    }
 
+    if (this->isDeviceAccessible()) {
+        Gpu::Device::freeAfterSync(this, vp);
+    } else {
+        free_now(vp);
+    }
+}
+
+void
+CArena::free_now (void* vp)
+{
+    if (vp == nullptr) {
+        //
+        // Allow calls with NULL as allowed by C++ delete.
+        //
+        return;
+    }
+
    std::lock_guard<std::mutex> lock(carena_mutex);
 
    //
diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index eb2b95943e7..abe2f66c823 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define AMREX_GPU_MAX_STREAMS 8
 
@@ -43,11 +44,28 @@ using gpuDeviceProp_t = cudaDeviceProp;
    int maxParameterSize;
 };
 #endif
+
+class CArena;
 }
 #endif
 
 namespace amrex::Gpu {
 
+#ifdef AMREX_USE_GPU
+class StreamManager {
+    gpuStream_t m_stream;
+    bool m_is_synced = true;
+    Vector<std::pair<CArena*, void*>> m_free_wait_list;
+    std::mutex m_mutex;
+public:
+    [[nodiscard]] gpuStream_t get ();
+    [[nodiscard]] gpuStream_t& internal_get ();
+    void sync ();
+    void internal_after_sync ();
+    void stream_free (CArena* arena, void* mem);
+};
+#endif
+
 class Device
 {
@@ -57,17 +75,29 @@ public:
 
    static void Finalize ();
 
 #if defined(AMREX_USE_GPU)
-    static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
+    static gpuStream_t gpuStream () noexcept {
+        return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
+    }
 #ifdef AMREX_USE_CUDA
    /** for backward compatibility */
-    static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
+    static cudaStream_t cudaStream () noexcept {
+        return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
+    }
 #endif
 #ifdef AMREX_USE_SYCL
-    static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
-    static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
+    static sycl::queue& streamQueue () noexcept {
+        return *(gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get().queue);
+    }
+    static sycl::queue& streamQueue (int i) noexcept {
+        return *(gpu_stream_pool[i].get().queue);
+    }
 #endif
 #endif
 
+    static void freeAfterSync (CArena* arena, void* mem) noexcept {
+        gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].stream_free(arena, mem);
+    }
+
    static int numGpuStreams () noexcept {
        return inSingleStreamRegion() ? 1 : max_gpu_streams;
    }
 
@@ -196,10 +226,11 @@ private:
    static AMREX_EXPORT dim3 numThreadsMin;
    static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
 
-    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_stream
-    // The non-owning gpu_stream is used to store the current stream that will be used.
-    // gpu_stream is a vector so that it's thread safe to write to it.
-    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
+    static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
+    // The non-owning gpu_stream_index is used to store the current stream index that will be used.
+    // gpu_stream_index is a vector so that it's thread safe to write to it.
+    static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads
+
    static AMREX_EXPORT gpuDeviceProp_t device_prop;
    static AMREX_EXPORT int memory_pools_supported;
    static AMREX_EXPORT unsigned int max_blocks_per_launch;
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 1bc05cee196..8e67119015c 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -1,4 +1,5 @@
 
+#include 
 #include 
 #include 
 #include 
@@ -93,15 +94,75 @@ int Device::max_gpu_streams = 1;
 #endif
 
 #ifdef AMREX_USE_GPU
+
+[[nodiscard]] gpuStream_t
+StreamManager::get () {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    m_is_synced = false;
+    return m_stream;
+}
+
+[[nodiscard]] gpuStream_t&
+StreamManager::internal_get () {
+    return m_stream;
+}
+
+void
+StreamManager::sync () {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    if (!m_is_synced) {
+        m_is_synced = true;
+#ifdef AMREX_USE_SYCL
+        auto& q = *(m_stream.queue);
+        try {
+            q.wait_and_throw();
+        } catch (sycl::exception const& ex) {
+            amrex::Abort(std::string("streamSynchronize: ")+ex.what()+"!!!!!");
+        }
+#else
+        AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));,
+                           AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream)); )
+#endif
+        for (auto [arena, mem] : m_free_wait_list) {
+            arena->free_now(mem);
+        }
+        m_free_wait_list.clear();
+    }
+}
+
+void
+StreamManager::internal_after_sync () {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    m_is_synced = true;
+    for (auto [arena, mem] : m_free_wait_list) {
+        arena->free_now(mem);
+    }
+    m_free_wait_list.clear();
+}
+
+void
+StreamManager::stream_free (CArena* arena, void* mem) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    if (m_is_synced) {
+        arena->free_now(mem);
+    } else {
+        m_free_wait_list.emplace_back(arena, mem);
+    }
+}
+
 dim3 Device::numThreadsMin = dim3(1, 1, 1);
 dim3 Device::numThreadsOverride = dim3(0, 0, 0);
 dim3 Device::numBlocksOverride = dim3(0, 0, 0);
 unsigned int Device::max_blocks_per_launch = 2560;
 
-Vector<gpuStream_t> Device::gpu_stream_pool;
-Vector<gpuStream_t> Device::gpu_stream;
-gpuDeviceProp_t     Device::device_prop;
-int                 Device::memory_pools_supported = 0;
+Vector<StreamManager> Device::gpu_stream_pool;
+Vector<int>           Device::gpu_stream_index;
+gpuDeviceProp_t       Device::device_prop;
+int                   Device::memory_pools_supported = 0;
 
 constexpr int Device::warp_size;
 
@@ -383,24 +444,25 @@ void
 Device::Finalize ()
 {
 #ifdef AMREX_USE_GPU
+    streamSynchronizeAll();
    Device::profilerStop();
 
 #ifdef AMREX_USE_SYCL
    for (auto& s : gpu_stream_pool) {
-        delete s.queue;
-        s.queue = nullptr;
+        delete s.internal_get().queue;
+        s.internal_get().queue = nullptr;
    }
    sycl_context.reset();
    sycl_device.reset();
 #else
    for (int i = 0; i < max_gpu_streams; ++i) {
-        AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL( hipStreamDestroy(gpu_stream_pool[i]));,
-                           AMREX_CUDA_SAFE_CALL(cudaStreamDestroy(gpu_stream_pool[i])); );
+        AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL( hipStreamDestroy(gpu_stream_pool[i].internal_get()));,
+                           AMREX_CUDA_SAFE_CALL(cudaStreamDestroy(gpu_stream_pool[i].internal_get())); );
    }
 #endif
 
-    gpu_stream.clear();
+    gpu_stream_index.clear();
 
 #ifdef AMREX_USE_ACC
    amrex_finalize_acc();
@@ -416,7 +478,9 @@ Device::initialize_gpu (bool minimal)
 
 #ifdef AMREX_USE_GPU
 
-    gpu_stream_pool.resize(max_gpu_streams);
+    if (gpu_stream_pool.size() != max_gpu_streams) {
+        gpu_stream_pool = Vector<StreamManager>(max_gpu_streams);
+    }
 
 #ifdef AMREX_USE_HIP
 
@@ -429,7 +493,7 @@ Device::initialize_gpu (bool minimal)
    // AMD devices do not support shared cache banking.
 
    for (int i = 0; i < max_gpu_streams; ++i) {
-        AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_stream_pool[i]));
+        AMREX_HIP_SAFE_CALL(hipStreamCreate(&gpu_stream_pool[i].internal_get()));
    }
 
 #ifdef AMREX_GPU_STREAM_ALLOC_SUPPORT
@@ -457,9 +521,9 @@ Device::initialize_gpu (bool minimal)
 #endif
 
    for (int i = 0; i < max_gpu_streams; ++i) {
-        AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i]));
+        AMREX_CUDA_SAFE_CALL(cudaStreamCreate(&gpu_stream_pool[i].internal_get()));
 #ifdef AMREX_USE_ACC
-        acc_set_cuda_stream(i, gpu_stream_pool[i]);
+        acc_set_cuda_stream(i, gpu_stream_pool[i].internal_get());
 #endif
    }
 
@@ -472,7 +536,7 @@ Device::initialize_gpu (bool minimal)
    sycl_device = std::make_unique<sycl::device>(gpu_devices[device_id]);
    sycl_context = std::make_unique<sycl::context>(*sycl_device, amrex_sycl_error_handler);
    for (int i = 0; i < max_gpu_streams; ++i) {
-        gpu_stream_pool[i].queue = new sycl::queue(*sycl_context, *sycl_device,
+        gpu_stream_pool[i].internal_get().queue = new sycl::queue(*sycl_context, *sycl_device,
                                  sycl::property_list{sycl::property::queue::in_order{}});
    }
 }
@@ -555,7 +619,7 @@ Device::initialize_gpu (bool minimal)
    }
 #endif
 
-    gpu_stream.resize(OpenMP::get_max_threads(), gpu_stream_pool[0]);
+    gpu_stream_index.resize(OpenMP::get_max_threads(), 0);
 
    ParmParse pp("device");
 
@@ -625,8 +689,13 @@ int Device::numDevicePartners () noexcept
 int
 Device::streamIndex (gpuStream_t s) noexcept
 {
-    auto it = std::find(std::begin(gpu_stream_pool), std::end(gpu_stream_pool), s);
-    return static_cast<int>(std::distance(std::begin(gpu_stream_pool), it));
+    const int N = gpu_stream_pool.size();
+    for (int i = 0; i < N ; ++i) {
+        if (gpu_stream_pool[i].internal_get() == s) {
+            return i;
+        }
+    }
+    return N;
 }
 #endif
 
@@ -635,7 +704,7 @@ Device::setStreamIndex (int idx) noexcept
 {
    amrex::ignore_unused(idx);
 #ifdef AMREX_USE_GPU
-    gpu_stream[OpenMP::get_thread_num()] = gpu_stream_pool[idx % max_gpu_streams];
+    gpu_stream_index[OpenMP::get_thread_num()] = idx % max_gpu_streams;
 #ifdef AMREX_USE_ACC
    amrex_set_acc_stream(idx % max_gpu_streams);
 #endif
@@ -646,16 +715,16 @@ Device::setStreamIndex (int idx) noexcept
 gpuStream_t
 Device::resetStream () noexcept
 {
-    gpuStream_t r = gpu_stream[OpenMP::get_thread_num()];
-    gpu_stream[OpenMP::get_thread_num()] = gpu_stream_pool[0];
+    gpuStream_t r = gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
+    gpu_stream_index[OpenMP::get_thread_num()] = 0;
    return r;
 }
 
 gpuStream_t
 Device::setStream (gpuStream_t s) noexcept
 {
-    gpuStream_t r = gpu_stream[OpenMP::get_thread_num()];
-    gpu_stream[OpenMP::get_thread_num()] = s;
+    gpuStream_t r = gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get();
+    gpu_stream_index[OpenMP::get_thread_num()] = streamIndex(s);
    return r;
 }
 #endif
 
@@ -664,32 +733,23 @@ void
 Device::synchronize () noexcept
 {
 #ifdef AMREX_USE_SYCL
-    for (auto const& s : gpu_stream_pool) {
-        try {
-            s.queue->wait_and_throw();
-        } catch (sycl::exception const& ex) {
-            amrex::Abort(std::string("synchronize: ")+ex.what()+"!!!!!");
-        }
+    for (auto& s : gpu_stream_pool) {
+        s.sync();
    }
 #else
    AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipDeviceSynchronize());,
                       AMREX_CUDA_SAFE_CALL(cudaDeviceSynchronize()); )
+    for (auto& s : gpu_stream_pool) {
+        s.internal_after_sync();
+    }
 #endif
 }
 
 void
 Device::streamSynchronize () noexcept
 {
-#ifdef AMREX_USE_SYCL
-    auto& q = streamQueue();
-    try {
-        q.wait_and_throw();
-    } catch (sycl::exception const& ex) {
-        amrex::Abort(std::string("streamSynchronize: ")+ex.what()+"!!!!!");
-    }
-#else
-    AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(gpuStream()));,
-                       AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(gpuStream())); )
+#ifdef AMREX_USE_GPU
+    gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].sync();
 #endif
 }
 
@@ -697,15 +757,10 @@ void
 Device::streamSynchronizeAll () noexcept
 {
 #ifdef AMREX_USE_GPU
-#ifdef AMREX_USE_SYCL
-    Device::synchronize();
-#else
-    for (auto const& s : gpu_stream_pool) {
-        AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(s));,
-                           AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(s)); )
+    for (auto& s : gpu_stream_pool) {
+        s.sync();
    }
 #endif
-#endif
 }
 
 #if defined(__CUDACC__) && defined(AMREX_USE_CUDA)

From 4872a96b1ddc89e7128ff7a37a823d5004a0471f Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Sun, 27 Apr 2025 19:46:35 +0200
Subject: [PATCH 2/6] fix

---
 Src/Base/AMReX_GpuDevice.H   | 9 +++++++--
 Src/Base/AMReX_GpuDevice.cpp | 2 +-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index abe2f66c823..cb4d706a515 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -44,11 +44,13 @@ using gpuDeviceProp_t = cudaDeviceProp;
    int maxParameterSize;
 };
 #endif
-
-class CArena;
 }
 #endif
 
+namespace amrex {
+    class CArena;
+}
+
 namespace amrex::Gpu {
 
 #ifdef AMREX_USE_GPU
@@ -95,7 +97,10 @@ public:
 #endif
 
    static void freeAfterSync (CArena* arena, void* mem) noexcept {
+        amrex::ignore_unused(arena, mem);
+#ifdef AMREX_USE_CUDA
        gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].stream_free(arena, mem);
+#endif
    }
 
    static int numGpuStreams () noexcept {
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 8e67119015c..fcc7af21253 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -736,7 +736,7 @@ Device::synchronize () noexcept
    for (auto& s : gpu_stream_pool) {
        s.sync();
    }
-#else
+#elif defined(AMREX_USE_GPU)
    AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipDeviceSynchronize());,
                       AMREX_CUDA_SAFE_CALL(cudaDeviceSynchronize()); )
    for (auto& s : gpu_stream_pool) {

From 0b2c241f2f74bdcd8a7ba88955d8e8812f69ee8c Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Sun, 11 May 2025 23:09:07 +0200
Subject: [PATCH 3/6] more robust threading and better streamSync error

---
 Src/Base/AMReX_Arena.cpp     |   1 +
 Src/Base/AMReX_GpuDevice.H   |  13 +++-
 Src/Base/AMReX_GpuDevice.cpp | 118 +++++++++++++++++++++++++++--------
 Src/Base/AMReX_GpuElixir.cpp |   8 ++-
 4 files changed, 113 insertions(+), 27 deletions(-)

diff --git a/Src/Base/AMReX_Arena.cpp b/Src/Base/AMReX_Arena.cpp
index ff33163884e..4a20c7c6680 100644
--- a/Src/Base/AMReX_Arena.cpp
+++ b/Src/Base/AMReX_Arena.cpp
@@ -175,6 +175,7 @@ Arena::allocate_system (std::size_t nbytes) // NOLINT(readability-make-member-fu
 {
    std::size_t free_mem_avail = Gpu::Device::freeMemAvailable();
    if (nbytes >= free_mem_avail) {
+        Gpu::streamSynchronizeAll(); // this could cause some memory to be freed
        free_mem_avail += freeUnused_protected(); // For CArena, mutex has already acquired
        if (abort_on_out_of_gpu_memory && nbytes >= free_mem_avail) {
            amrex::Abort("Out of gpu memory. Free: " + std::to_string(free_mem_avail)
diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H
index cb4d706a515..979433e0ee8 100644
--- a/Src/Base/AMReX_GpuDevice.H
+++ b/Src/Base/AMReX_GpuDevice.H
@@ -56,7 +56,8 @@ namespace amrex::Gpu {
 #ifdef AMREX_USE_GPU
 class StreamManager {
    gpuStream_t m_stream;
-    bool m_is_synced = true;
+    std::uint64_t m_stream_op_id = 0;
+    std::uint64_t m_last_sync = 0;
    Vector<std::pair<CArena*, void*>> m_free_wait_list;
    std::mutex m_mutex;
 public:
@@ -139,6 +140,16 @@ public:
    */
    static void streamSynchronizeAll () noexcept;
 
+#ifdef AMREX_USE_GPU
+    /**
+     * Halt execution of code until the current AMReX GPU stream has finished processing all
+     * previously requested tasks. Unlike streamSynchronize, which avoids redundant
+     * synchronizations when being called multiple times in a row,
+     * this function always causes the GPU stream to be synchronized.
+     */
+    static void actualStreamSynchronize (gpuStream_t stream) noexcept;
+#endif
+
 #if defined(__CUDACC__)
    /** Generic graph selection. These should be called by users. */
    static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index fcc7af21253..d69b4d23948 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -98,8 +98,7 @@ int Device::max_gpu_streams = 1;
 [[nodiscard]] gpuStream_t
 StreamManager::get () {
    std::lock_guard<std::mutex> lock(m_mutex);
-
-    m_is_synced = false;
+    ++m_stream_op_id;
    return m_stream;
 }
 
@@ -110,47 +109,75 @@ StreamManager::internal_get () {
 
 void
 StreamManager::sync () {
-    std::lock_guard<std::mutex> lock(m_mutex);
 
-    if (!m_is_synced) {
-        m_is_synced = true;
-#ifdef AMREX_USE_SYCL
-        auto& q = *(m_stream.queue);
-        try {
-            q.wait_and_throw();
-        } catch (sycl::exception const& ex) {
-            amrex::Abort(std::string("streamSynchronize: ")+ex.what()+"!!!!!");
+    bool is_synced = false;
+    std::uint64_t sync_op = 0;
+    decltype(m_free_wait_list) new_empty_wait_list{};
+    {
+        // lock mutex before accessing and modifying member variables
+        std::lock_guard<std::mutex> lock(m_mutex);
+        is_synced = (m_stream_op_id == m_last_sync);
+        if (!is_synced)) {
+            sync_op = m_stream_op_id;
+            m_free_wait_list.swap(new_empty_wait_list);
        }
-#else
-        AMREX_HIP_OR_CUDA( AMREX_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));,
-                           AMREX_CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream)); )
-#endif
-        for (auto [arena, mem] : m_free_wait_list) {
+        // unlock mutex before stream sync and memory free
+        // to avoid deadlocks from the CArena mutex
+    }
+
+    if (!is_synced) {
+        Device::actualStreamSynchronize(stream)
+
+        // synchronizing the stream may have taken a long time and
+        // there may be new kernels launched already, so we free memory and
+        // set m_last_sync according to the state from before the stream was synced
+
+        {
+            std::lock_guard<std::mutex> lock(m_mutex);
+            m_last_sync = sync_op;
+        }
+
+        for (auto [arena, mem] : new_empty_wait_list) {
            arena->free_now(mem);
        }
-        m_free_wait_list.clear();
    }
 }
 
 void
 StreamManager::internal_after_sync () {
-    std::lock_guard<std::mutex> lock(m_mutex);
 
-    m_is_synced = true;
-    for (auto [arena, mem] : m_free_wait_list) {
+    decltype(m_free_wait_list) new_empty_wait_list{};
+    {
+        // lock mutex before accessing and modifying member variables
+        std::lock_guard<std::mutex> lock(m_mutex);
+        m_last_sync = m_stream_op_id;
+        m_free_wait_list.swap(new_empty_wait_list);
+        // unlock mutex before memory free
+        // to avoid deadlocks from the CArena mutex
+    }
+
+    for (auto [arena, mem] : new_empty_wait_list) {
        arena->free_now(mem);
    }
-    m_free_wait_list.clear();
 }
 
 void
 StreamManager::stream_free (CArena* arena, void* mem) {
-    std::lock_guard<std::mutex> lock(m_mutex);
 
-    if (m_is_synced) {
+    bool is_synced = false;
+    {
+        // lock mutex before accessing and modifying member variables
+        std::lock_guard<std::mutex> lock(m_mutex);
+        is_synced = (m_stream_op_id == m_last_sync);
+        if (!is_synced) {
+            m_free_wait_list.emplace_back(arena, mem);
+        }
+        // unlock mutex before memory free
+        // to avoid deadlocks from the CArena mutex
+    }
+
+    if (is_synced) {
        arena->free_now(mem);
-    } else {
-        m_free_wait_list.emplace_back(arena, mem);
    }
 }
 
@@ -763,6 +790,47 @@ Device::streamSynchronizeAll () noexcept
 #endif
 }
 
+#ifdef AMREX_USE_GPU
+void
+Device::actualStreamSynchronize (gpuStream_t stream) noexcept
+{
+#if defined(AMReX_USE_CUDA)
+    cudaError_t amrex_i_err = cudaStreamSynchronize(stream);
+    if (cudaSuccess != amrex_i_err) {
+        std::string errStr(std::string("CUDA error from calling cudaStreamSynchronize ")
+            + std::to_string(amrex_i_err)
+            + std::string(" in file ") + __FILE__
+            + ": " + cudaGetErrorString(amrex_i_err)
+            + "This is likely caused by an issue in a previous kernel launch "
+            + "such as amrex::ParallelFor");
+        amrex::Abort(errStr);
+    }
+#elif defined(AMReX_USE_HIP)
+    hipError_t amrex_i_err = hipStreamSynchronize(stream);
+    if (hipSuccess != amrex_i_err) {
+        std::string errStr(std::string("HIP error from calling hipStreamSynchronize")
+            + std::string(" in file ") + __FILE__
+            + ": " + hipGetErrorString(amrex_i_err)
+            + "This is likely caused by an issue in a previous kernel launch "
+            + "such as amrex::ParallelFor"));
+        amrex::Abort(errStr);
+    }
+#elif defined(AMREX_USE_SYCL)
+    auto& q = *(stream.queue);
+    try {
+        q.wait_and_throw();
+    } catch (sycl::exception const& ex) {
+        std::string errStr(std::string("SYCL exception from calling queue.wait_and_throw")
+            + std::string(" in file ") + __FILE__
+            + ": " + ex.what() + "!!!!!"
+ + "This is likely caused by an issue in a previous kernel launch " + + "such as amrex::ParallelFor"); + amrex::Abort(errStr); + } +#endif +} +#endif + #if defined(__CUDACC__) && defined(AMREX_USE_CUDA) void diff --git a/Src/Base/AMReX_GpuElixir.cpp b/Src/Base/AMReX_GpuElixir.cpp index f7c47578090..a7050c29e21 100644 --- a/Src/Base/AMReX_GpuElixir.cpp +++ b/Src/Base/AMReX_GpuElixir.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -21,7 +22,12 @@ extern "C" { { auto p_pa = reinterpret_cast >*>(p); for (auto const& pa : *p_pa) { - pa.second->free(pa.first); + auto* p = dynamic_cast(pa.second); + if (p) { + p->free_now(pa.first); + } else { + pa.second->free(pa.first); + } } delete p_pa; } From b76a1a37ba02f82f9fa14b56874351a2238068ac Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Sun, 11 May 2025 23:14:07 +0200 Subject: [PATCH 4/6] fix --- Src/Base/AMReX_GpuDevice.cpp | 4 ++-- Src/Base/AMReX_GpuElixir.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index d69b4d23948..76d2996fb03 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -117,7 +117,7 @@ StreamManager::sync () { // lock mutex before accessing and modifying member variables std::lock_guard lock(m_mutex); is_synced = (m_stream_op_id == m_last_sync); - if (!is_synced)) { + if (!is_synced) { sync_op = m_stream_op_id; m_free_wait_list.swap(new_empty_wait_list); } @@ -126,7 +126,7 @@ StreamManager::sync () { } if (!is_synced) { - Device::actualStreamSynchronize(stream) + Device::actualStreamSynchronize(m_stream) // synconizing the stream may have taken a long time and // there may be new kernels launched already, so we free memory and diff --git a/Src/Base/AMReX_GpuElixir.cpp b/Src/Base/AMReX_GpuElixir.cpp index a7050c29e21..94450bc59b7 100644 --- a/Src/Base/AMReX_GpuElixir.cpp +++ b/Src/Base/AMReX_GpuElixir.cpp @@ -22,9 +22,9 @@ extern "C" { { auto p_pa = reinterpret_cast >*>(p); for (auto const& pa : *p_pa) { - auto* p = dynamic_cast(pa.second); - if (p) { - p->free_now(pa.first); + auto* carena = dynamic_cast(pa.second); + if (carena) { + carena->free_now(pa.first); } else { pa.second->free(pa.first); } From 705f6cca49aab620d5c8d3bfc346d977b4bf75fd Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Sun, 11 May 2025 23:34:33 +0200 Subject: [PATCH 5/6] fix 2 --- Src/Base/AMReX_GpuDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp index 76d2996fb03..6693105fc86 100644 --- a/Src/Base/AMReX_GpuDevice.cpp +++ b/Src/Base/AMReX_GpuDevice.cpp @@ -126,7 +126,7 @@ StreamManager::sync () { } if (!is_synced) { - Device::actualStreamSynchronize(m_stream) + Device::actualStreamSynchronize(m_stream); // synconizing the stream may have taken a long time and // there may be new kernels launched already, so we free memory and @@ -794,7 +794,7 @@ Device::streamSynchronizeAll () noexcept void Device::actualStreamSynchronize (gpuStream_t stream) noexcept { -#if defined(AMReX_USE_CUDA) +#if defined(AMREX_USE_CUDA) cudaError_t amrex_i_err = cudaStreamSynchronize(stream); if (cudaSuccess != amrex_i_err) { std::string errStr(std::string("CUDA error from calling cudaStreamSynchronize ") @@ -805,7 +805,7 @@ Device::actualStreamSynchronize (gpuStream_t stream) noexcept + "such as amrex::ParallelFor"); amrex::Abort(errStr); } -#elif defined(AMReX_USE_HIP) +#elif defined(AMREX_USE_HIP) hipError_t amrex_i_err = 
    hipError_t amrex_i_err = hipStreamSynchronize(stream);
    if (hipSuccess != amrex_i_err) {
        std::string errStr(std::string("HIP error from calling hipStreamSynchronize")

From b3e395064ccd9610958c9b6a84c1e73213acdaf1 Mon Sep 17 00:00:00 2001
From: AlexanderSinn
Date: Mon, 12 May 2025 00:08:56 +0200
Subject: [PATCH 6/6] fix 3

---
 Src/Base/AMReX_GpuDevice.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_GpuDevice.cpp b/Src/Base/AMReX_GpuDevice.cpp
index 6693105fc86..2d2573b7937 100644
--- a/Src/Base/AMReX_GpuDevice.cpp
+++ b/Src/Base/AMReX_GpuDevice.cpp
@@ -812,7 +812,7 @@ Device::actualStreamSynchronize (gpuStream_t stream) noexcept
            + std::string(" in file ") + __FILE__
            + ": " + hipGetErrorString(amrex_i_err)
            + "This is likely caused by an issue in a previous kernel launch "
-            + "such as amrex::ParallelFor"));
+            + "such as amrex::ParallelFor");
        amrex::Abort(errStr);
    }
 #elif defined(AMREX_USE_SYCL)
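
For reference, the deferred-free pattern these patches introduce can be reduced to a small standalone sketch. Everything below is illustrative only and is not part of the AMReX patches: FakeArena, DeferredFreeStream, and use() are made-up stand-ins for amrex::CArena, amrex::Gpu::StreamManager, and real stream usage, and the counter pair m_stream_op_id / m_last_sync mirrors the logic from PATCH 3 without touching a real GPU stream.

// deferred_free_sketch.cpp -- minimal sketch, not AMReX code.
// Build with: g++ -std=c++17 deferred_free_sketch.cpp
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <utility>
#include <vector>

struct FakeArena {                       // stand-in for amrex::CArena
    void free_now (void* p) { std::printf("freeing %p\n", p); std::free(p); }
};

class DeferredFreeStream {               // stand-in for amrex::Gpu::StreamManager
    std::uint64_t m_stream_op_id = 0;    // bumped every time the stream is handed out
    std::uint64_t m_last_sync    = 0;    // value of m_stream_op_id at the last sync
    std::vector<std::pair<FakeArena*, void*>> m_free_wait_list;
    std::mutex m_mutex;
public:
    // Caller is about to enqueue work: record that the stream is "dirty" now.
    void use () {
        std::lock_guard<std::mutex> lock(m_mutex);
        ++m_stream_op_id;
    }

    // free() as seen by an arena: free immediately if the stream has no
    // outstanding work, otherwise park the pointer until the next sync.
    void stream_free (FakeArena* arena, void* mem) {
        bool is_synced = false;
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            is_synced = (m_stream_op_id == m_last_sync);
            if (!is_synced) { m_free_wait_list.emplace_back(arena, mem); }
        }   // unlock before calling into the arena to avoid lock-order inversions
        if (is_synced) { arena->free_now(mem); }
    }

    // Wait for the stream (omitted here), then release the parked memory.
    void sync () {
        bool is_synced = false;
        std::uint64_t sync_op = 0;
        decltype(m_free_wait_list) parked;
        {
            std::lock_guard<std::mutex> lock(m_mutex);
            is_synced = (m_stream_op_id == m_last_sync);
            if (!is_synced) {
                sync_op = m_stream_op_id;
                m_free_wait_list.swap(parked);
            }
        }   // unlock before the (potentially slow) stream sync and the frees
        if (!is_synced) {
            // ... a real implementation would call cudaStreamSynchronize /
            //     hipStreamSynchronize / queue.wait_and_throw() here ...
            {
                std::lock_guard<std::mutex> lock(m_mutex);
                m_last_sync = sync_op;   // only up to the state seen before the sync
            }
            for (auto [arena, mem] : parked) { arena->free_now(mem); }
        }
    }
};

int main () {
    FakeArena arena;
    DeferredFreeStream stream;

    void* a = std::malloc(64);
    stream.stream_free(&arena, a);   // stream idle: freed immediately

    stream.use();                    // pretend a kernel was launched
    void* b = std::malloc(64);
    stream.stream_free(&arena, b);   // stream busy: parked on the wait list

    stream.sync();                   // "kernel" done: b is freed here
}

The two-step locking in sync() and stream_free() illustrates the same design choice as the patch: the stream mutex is always released before free_now() is called, so the arena can take its own mutex without risking a lock-order deadlock, and m_last_sync is set from the operation id captured before the synchronization rather than the current one.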