Skip to content

Commit 8d4ab9f

Browse files
committed
Add option to remove synchronization between interact! kernels on GPUs
1 parent c06d032 commit 8d4ab9f

File tree

3 files changed

+42
-13
lines changed

3 files changed

+42
-13
lines changed

src/TrixiParticles.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled,
4141
FullGridCellList, DictionaryCellList,
4242
SerialBackend, PolyesterBackend, ThreadsStaticBackend,
4343
ThreadsDynamicBackend, default_backend
44-
using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search,
45-
@threaded
44+
using PointNeighbors: PointNeighbors, foreach_point_neighbor, foreach_neighbor,
45+
copy_neighborhood_search, @threaded, @threaded_nosync
4646
using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save
4747

4848
# `util.jl` needs to be first because of the macros `@trixi_timeit` and `@threaded`

src/general/gpu.jl

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,29 @@ Adapt.@adapt_structure DEMSystem
2424
Adapt.@adapt_structure BoundaryDEMSystem
2525
Adapt.@adapt_structure RCRWindkesselModel
2626

27-
KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU()
28-
function KernelAbstractions.get_backend(system::AbstractSystem)
29-
KernelAbstractions.get_backend(system.mass)
30-
end
31-
32-
function KernelAbstractions.get_backend(system::WallBoundarySystem)
33-
KernelAbstractions.get_backend(system.coordinates)
34-
end
35-
3627
# This makes `@threaded semi for ...` use `semi.parallelization_backend` for parallelization
3728
@inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
3829
PointNeighbors.parallel_foreach(f, iterator, semi.parallelization_backend)
3930
end
4031

32+
# Same with `@threaded_nosync`
33+
@inline function PointNeighbors.parallel_foreach_nosync(f, iterator,
34+
semi::Semidiscretization)
35+
PointNeighbors.parallel_foreach_nosync(f, iterator, semi.parallelization_backend)
36+
end
37+
4138
function allocate(backend::KernelAbstractions.GPU, ELTYPE, size)
4239
return KernelAbstractions.allocate(backend, ELTYPE, size)
4340
end
4441

4542
function allocate(backend, ELTYPE, size)
4643
return Array{ELTYPE, length(size)}(undef, size)
4744
end
45+
46+
@inline function synchronize_backend(backend::KernelAbstractions.GPU)
47+
return KernelAbstractions.synchronize(backend)
48+
end
49+
50+
@inline function synchronize_backend(backend)
51+
return nothing
52+
end

src/general/semidiscretization.jl

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ semi = Semidiscretization(fluid_system, boundary_system,
4949
└──────────────────────────────────────────────────────────────────────────────────────────────────┘
5050
```
5151
"""
52-
struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
52+
struct Semidiscretization{TIMERS, S, RU, RV, NS, BACKEND, UCU, IT}
5353
systems :: S
5454
ranges_u :: RU
5555
ranges_v :: RV
@@ -128,6 +128,11 @@ end
128128

129129
struct IndividualTimers end
130130
struct CombinedTimers end
131+
struct NoTimers end
132+
133+
# Individual timers are usually not worth the overhead on GPUs
134+
default_timers(parallelization_backend) = IndividualTimers()
135+
default_timers(::KernelAbstractions.GPU) = CombinedTimers()
131136

132137
# Inline show function e.g. Semidiscretization(neighborhood_search=...)
133138
function Base.show(io::IO, semi::Semidiscretization)
@@ -702,6 +707,21 @@ function system_interaction!(dv_ode, v_ode, u_ode,
702707
return dv_ode
703708
end
704709

710+
function system_interaction!(dv_ode, v_ode, u_ode,
711+
semi::Semidiscretization{NoTimers})
712+
foreach_system(semi) do system
713+
# Call a combined `interact!` for all interactions of this system with other systems.
714+
# Since no timers are used, we can avoid synchronization, as each combined
715+
interaction will only write to the part of `dv_ode` corresponding to its system.
716+
interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=false)
717+
end
718+
719+
# Now manually synchronize, since we disabled synchronization above
720+
synchronize_backend(semi.parallelization_backend)
721+
722+
return dv_ode
723+
end
724+
705725
# Function barrier to make benchmarking interactions easier.
706726
# One can benchmark, e.g. the fluid-fluid interaction, with:
707727
# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
@@ -744,14 +764,18 @@ function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true
744764

745765
# Loop over all particles that are integrated for this system, i.e., all particles
746766
# for which `dv` has entries.
747-
@threaded semi for particle in each_integrated_particle(system)
767+
# `@threaded_nosync` is the same as `@threaded` but without synchronization on GPUs.
768+
# Manual synchronization is done below.
769+
@threaded_nosync semi for particle in each_integrated_particle(system)
748770
# Now loop over all neighbor systems to avoid separate loops/kernels
749771
# for each pair of systems.
750772
foreach_noalloc(iterator) do (neighbor, v_neighbor, u_neighbor, nhs)
751773
interact!(dv, v_system, u_system, v_neighbor, u_neighbor,
752774
system, neighbor, semi, nhs, particle)
753775
end
754776
end
777+
778+
synchronize && synchronize_backend(semi.parallelization_backend)
755779
end
756780

757781
function check_update_callback(semi)

0 commit comments

Comments
 (0)