AMReX-Codes
diff --git a/‎CHANGES.md‎
Lines changed: 62 additions & 0 deletions b/‎CHANGES.md‎
Lines changed: 62 additions & 0 deletions
diff --git a/‎Src/Base/AMReX_GpuDevice.H‎
Lines changed: 46 additions & 8 deletions b/‎Src/Base/AMReX_GpuDevice.H‎
Lines changed: 46 additions & 8 deletions
@@ -1,3 +1,65 @@
+# 25.12
+
+ ## Highlights:
+
+  * There is a new overload of partitionParticles that takes num_left as an
+    input to skip the reduction that would compute num_left in the original
+    function. This can be useful when combining the reduction with other
+    operations in an effort to reduce the overhead from extra kernel
+    launches and stream synchronizations.
+
+  * We can now set the default value of amrex.the_arena_init_size with an
+    environment variable, AMREX_THE_ARENA_INIT_SIZE. This is convenient for
+    CI jobs.
+
+  * Fix restart with out-of-bounds particles. Seen on Frontier at 6000 nodes.
+    If the particle locator decided that a particle was out-of-bounds, it
+    used an inconsistent level for the particle in restart. Now, it uses the
+    currently loaded level consistently, with an invalid-marked tile.
+
+  * `PC::make_alike<Allocator>` changes the template default of
+    `make_alike<>()` to use the same allocator as the creating
+    allocator. This is a breaking change.
+
+  * Add some extra room when we call `PODVector::resize` and `reserve`. By
+    default, the extra capacity is computed as 3*sqrt(capacity), and is
+    capped at 10%. Other strategies can be specified with the GrowthStrategy
+    argument to PODVector resize and reserve. This helps particle codes
+    avoid memory re-allocation.
+
+ ## Other major changes:
+
+  * Fix compile error with Conduit + Particles (#4813)
+
+  * Fix loop bounds in selectActualNeighbors (#4809)
+
+  * RNG on GPU: Assert that it's not in an OMP parallel region (#4799)
+
+  * Add int overflow assert to PrefixSum (#4794)
+
+  * Add index and size information to Vector assertion message (#4790)
+
+  * Add Amr::derive overloads for all levels (#4780)
+
+  * ParticleContainerToBlueprint: Allocator (#4776)
+
+  * Add amrex::Math::rsqrt (#4777)
+
+  * AMREX_ENUM: Fix enumerator = int (#4766)
+
+  * Make htod_memcpy_async available on CPU (#4640)
+
+  * Add ParmParse::Add for AMREX_NUM (#4765)
+
+  * Use atomic add in SRD algorithm (#4754)
+  * Refactor MLStateRedist to run faster when many cells have no neighbors (#4742)
+
+  * amrex::Initialize: Add optional argument of device ID (#4741)
+
+  * Minor optimization of ReduceToPlaneMF2 (#4745)
+
+  * Fix PODVector shrink_to_fit() with nonzero size (#4748)
+
 # 25.11
 
  ## Highlights:
 
@@ -16,6 +16,7 @@
 #include <cstdlib>
 #include <cstring>
 #include <memory>
+#include <mutex>
 
 #define AMREX_GPU_MAX_STREAMS 8
 
@@ -46,8 +47,24 @@ using gpuDeviceProp_t = cudaDeviceProp;
 }
 #endif
 
+namespace amrex {
+    class Arena;
+}
+
 namespace amrex::Gpu {
 
+#ifdef AMREX_USE_GPU
+class StreamManager { // Bundles one gpuStream_t with a mutex and a list of (arena, pointer) entries.
+    gpuStream_t m_stream; // NOTE(review): presumably the stream handed out by get() -- definition not visible in this header
+    std::mutex m_mutex; // NOTE(review): presumably guards m_free_wait_list -- confirm in the implementation file
+    Vector<std::pair<Arena*, void*>> m_free_wait_list; // (arena, memory) pairs; presumably frees deferred until sync() -- TODO confirm
+public:
+    [[nodiscard]] gpuStream_t& get (); // returns a mutable reference to a gpuStream_t; the result must not be discarded
+    void sync (); // defined elsewhere; NOTE(review): presumably synchronizes the stream and releases the wait list -- confirm
+    void free_async (Arena* arena, void* mem); // takes an arena and a pointer to memory; defined elsewhere
+};
+#endif
+
 class Device
 {
 
@@ -57,14 +74,16 @@ public:
     static void Finalize ();
 
 #if defined(AMREX_USE_GPU)
-    static gpuStream_t gpuStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
+    static gpuStream_t gpuStream () noexcept { // Stream selected for the calling OpenMP thread, returned by value.
+        return gpu_stream_pool[gpu_stream_index[OpenMP::get_thread_num()]].get(); // per-thread index picks the pool entry
+    }
 #ifdef AMREX_USE_CUDA
     /** for backward compatibility */
-    static cudaStream_t cudaStream () noexcept { return gpu_stream[OpenMP::get_thread_num()]; }
+    static cudaStream_t cudaStream () noexcept { return gpuStream(); }
 #endif
 #ifdef AMREX_USE_SYCL
-    static sycl::queue& streamQueue () noexcept { return *(gpu_stream[OpenMP::get_thread_num()].queue); }
-    static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].queue); }
+    static sycl::queue& streamQueue () noexcept { return *(gpuStream().queue); }
+    static sycl::queue& streamQueue (int i) noexcept { return *(gpu_stream_pool[i].get().queue); }
 #endif
 #endif
 
@@ -104,6 +123,8 @@ public:
      */
     static void streamSynchronizeAll () noexcept;
 
+    static void freeAsync (Arena* arena, void* mem) noexcept;
+
 #if defined(__CUDACC__)
     /**  Generic graph selection. These should be called by users.  */
     static void startGraphRecording(bool first_iter, void* h_ptr, void* d_ptr, size_t sz);
@@ -196,10 +217,10 @@ private:
     static AMREX_EXPORT dim3 numThreadsMin;
     static AMREX_EXPORT dim3 numBlocksOverride, numThreadsOverride;
 
-    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream_pool; // The size of this is max_gpu_stream
-    // The non-owning gpu_stream is used to store the current stream that will be used.
-    // gpu_stream is a vector so that it's thread safe to write to it.
-    static AMREX_EXPORT Vector<gpuStream_t> gpu_stream; // The size of this is omp_max_threads
+    static AMREX_EXPORT Vector<StreamManager> gpu_stream_pool; // The size of this is max_gpu_stream
+    // The non-owning gpu_stream_index is used to store the current stream index that will be used.
+    // gpu_stream_index is a vector so that it's thread safe to write to it.
+    static AMREX_EXPORT Vector<int> gpu_stream_index; // The size of this is omp_max_threads
     static AMREX_EXPORT gpuDeviceProp_t device_prop;
     static AMREX_EXPORT int memory_pools_supported;
     static AMREX_EXPORT unsigned int max_blocks_per_launch;
@@ -208,6 +229,8 @@ private:
     static AMREX_EXPORT std::unique_ptr<sycl::context> sycl_context;
     static AMREX_EXPORT std::unique_ptr<sycl::device>  sycl_device;
 #endif
+
+    friend StreamManager;
 #endif
 };
 
@@ -245,6 +268,21 @@ streamSynchronizeAll () noexcept
     Device::streamSynchronizeAll();
 }
 
+/** \brief Deallocate memory belonging to an arena asynchronously.
+ *
+ * Memory deallocated in this way is held in a pool and will not be reused
+ * until the next amrex::Gpu::streamSynchronize(). GPU kernels that were
+ * already launched on the currently active stream can therefore continue to
+ * use the memory after this function is called.
+ *
+ * There is no need to use this function for CPU-only memory or with
+ * The_Async_Arena.
+ *
+ * This free function simply forwards its arguments to Device::freeAsync.
+ *
+ * \param[in] arena the arena the memory belongs to
+ * \param[in] mem   pointer to the memory to be freed
+ */
+inline void
+freeAsync (Arena* arena, void* mem) noexcept
+{
+    Device::freeAsync(arena, mem);
+}
+
 #ifdef AMREX_USE_GPU
 
 inline void