@@ -49,7 +49,7 @@ semi = Semidiscretization(fluid_system, boundary_system,
4949└──────────────────────────────────────────────────────────────────────────────────────────────────┘
5050```
5151"""
52- struct Semidiscretization{BACKEND , S, RU, RV, NS, UCU, IT}
52+ struct Semidiscretization{TIMERS , S, RU, RV, NS, BACKEND , UCU, IT}
5353 systems :: S
5454 ranges_u :: RU
5555 ranges_v :: RV
128128
# Singleton tag types for the timer mode. They carry no data and exist only for dispatch:
# the timer mode is a type parameter of `Semidiscretization`, and `system_interaction!`
# has a method specialized on `Semidiscretization{NoTimers}`.
# NOTE(review): the methods handling `IndividualTimers` and `CombinedTimers` are presumably
# defined alongside that one; confirm where they live.

# Timer mode returned by `default_timers` for any backend without a more specific method.
struct IndividualTimers end

# Timer mode returned by `default_timers` for `KernelAbstractions.GPU` backends.
struct CombinedTimers end

# No timers. `system_interaction!` then runs one combined interaction per system without
# synchronizing in between and synchronizes the backend once at the end.
struct NoTimers end
132+
# Return the default timer mode for the given parallelization backend.
# Individual timers are usually not worth the overhead on GPUs, so `KernelAbstractions.GPU`
# backends default to `CombinedTimers`. Every other backend falls back to the generic
# method and gets `IndividualTimers`.
default_timers(parallelization_backend) = IndividualTimers()
default_timers(::KernelAbstractions.GPU) = CombinedTimers()
131136
132137# Inline show function e.g. Semidiscretization(neighborhood_search=...)
133138function Base. show (io:: IO , semi:: Semidiscretization )
@@ -702,6 +707,21 @@ function system_interaction!(dv_ode, v_ode, u_ode,
702707 return dv_ode
703708end
704709
# Compute the interactions of all systems when no timers are used (`NoTimers`).
#
# For every system in `semi`, one combined interaction is run with synchronization
# disabled. The backend is synchronized once after all systems have been processed.
# Returns `dv_ode`.
function system_interaction!(dv_ode, v_ode, u_ode,
                             semi::Semidiscretization{NoTimers})
    foreach_system(semi) do system
        # Call a combined `interact!` for all interactions of this system with
        # other systems. Since no timers are used, we can avoid synchronization,
        # as each combined interaction will only write to the part of `dv_ode`
        # corresponding to its system.
        interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=false)
    end

    # Now manually synchronize, since we disabled synchronization above
    synchronize_backend(semi.parallelization_backend)

    return dv_ode
end
724+
705725# Function barrier to make benchmarking interactions easier.
706726# One can benchmark, e.g. the fluid-fluid interaction, with:
707727# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
@@ -744,14 +764,18 @@ function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true
744764
745765 # Loop over all particles that are integrated for this system, i.e., all particles
746766 # for which `dv` has entries.
747- @threaded semi for particle in each_integrated_particle (system)
767+ # `@threaded_nosync` is the same as `@threaded` but without synchronization on GPUs.
768+ # Manual synchronization is done below.
769+ @threaded_nosync semi for particle in each_integrated_particle (system)
748770 # Now loop over all neighbor systems to avoid separate loops/kernels
749771 # for each pair of systems.
750772 foreach_noalloc (iterator) do (neighbor, v_neighbor, u_neighbor, nhs)
751773 interact! (dv, v_system, u_system, v_neighbor, u_neighbor,
752774 system, neighbor, semi, nhs, particle)
753775 end
754776 end
777+
778+ synchronize && synchronize_backend (semi. parallelization_backend)
755779end
756780
757781function check_update_callback (semi)
0 commit comments