Skip to content

Commit 8d4ab9f

Browse files
committed
Add option to remove synchronization between interact! kernels on GPUs
1 parent c06d032 commit 8d4ab9f

File tree

3 files changed

+42
-13
lines changed

3 files changed

+42
-13
lines changed

src/TrixiParticles.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled,
4141
FullGridCellList, DictionaryCellList,
4242
SerialBackend, PolyesterBackend, ThreadsStaticBackend,
4343
ThreadsDynamicBackend, default_backend
44-
using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search,
45-
@threaded
44+
using PointNeighbors: PointNeighbors, foreach_point_neighbor, foreach_neighbor,
45+
copy_neighborhood_search, @threaded, @threaded_nosync
4646
using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save
4747

4848
# `util.jl` needs to be first because of the macros `@trixi_timeit` and `@threaded`

src/general/gpu.jl

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,29 @@ Adapt.@adapt_structure DEMSystem
2424
Adapt.@adapt_structure BoundaryDEMSystem
2525
Adapt.@adapt_structure RCRWindkesselModel
2626

27-
KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU()
28-
function KernelAbstractions.get_backend(system::AbstractSystem)
29-
KernelAbstractions.get_backend(system.mass)
30-
end
31-
32-
function KernelAbstractions.get_backend(system::WallBoundarySystem)
33-
KernelAbstractions.get_backend(system.coordinates)
34-
end
35-
3627
# This makes `@threaded semi for ...` use `semi.parallelization_backend` for parallelization
3728
@inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
3829
PointNeighbors.parallel_foreach(f, iterator, semi.parallelization_backend)
3930
end
4031

32+
# Same with `@threaded_nosync`
33+
@inline function PointNeighbors.parallel_foreach_nosync(f, iterator,
34+
semi::Semidiscretization)
35+
PointNeighbors.parallel_foreach_nosync(f, iterator, semi.parallelization_backend)
36+
end
37+
4138
function allocate(backend::KernelAbstractions.GPU, ELTYPE, size)
4239
return KernelAbstractions.allocate(backend, ELTYPE, size)
4340
end
4441

4542
function allocate(backend, ELTYPE, size)
4643
return Array{ELTYPE, length(size)}(undef, size)
4744
end
45+
46+
@inline function synchronize_backend(backend::KernelAbstractions.GPU)
47+
return KernelAbstractions.synchronize(backend)
48+
end
49+
50+
@inline function synchronize_backend(backend)
51+
return nothing
52+
end

src/general/semidiscretization.jl

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ semi = Semidiscretization(fluid_system, boundary_system,
4949
└──────────────────────────────────────────────────────────────────────────────────────────────────┘
5050
```
5151
"""
52-
struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT}
52+
struct Semidiscretization{TIMERS, S, RU, RV, NS, BACKEND, UCU, IT}
5353
systems :: S
5454
ranges_u :: RU
5555
ranges_v :: RV
@@ -128,6 +128,11 @@ end
128128

129129
struct IndividualTimers end
130130
struct CombinedTimers end
131+
struct NoTimers end
132+
133+
# Individual timers are usually not worth the overhead on GPUs
134+
default_timers(parallelization_backend) = IndividualTimers()
135+
default_timers(::KernelAbstractions.GPU) = CombinedTimers()
131136

132137
# Inline show function e.g. Semidiscretization(neighborhood_search=...)
133138
function Base.show(io::IO, semi::Semidiscretization)
@@ -702,6 +707,21 @@ function system_interaction!(dv_ode, v_ode, u_ode,
702707
return dv_ode
703708
end
704709

710+
function system_interaction!(dv_ode, v_ode, u_ode,
711+
semi::Semidiscretization{NoTimers})
712+
foreach_system(semi) do system
713+
# Call a combined `interact!` for all interactions of this system with other systems.
714+
# Since no timers are used, we can avoid synchronization, as each combined
715+
interaction will only write to the part of `dv_ode` corresponding to its system.
716+
interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=false)
717+
end
718+
719+
# Now manually synchronize, since we disabled synchronization above
720+
synchronize_backend(semi.parallelization_backend)
721+
722+
return dv_ode
723+
end
724+
705725
# Function barrier to make benchmarking interactions easier.
706726
# One can benchmark, e.g. the fluid-fluid interaction, with:
707727
# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x;
@@ -744,14 +764,18 @@ function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true
744764

745765
# Loop over all particles that are integrated for this system, i.e., all particles
746766
# for which `dv` has entries.
747-
@threaded semi for particle in each_integrated_particle(system)
767+
# `@threaded_nosync` is the same as `@threaded` but without synchronization on GPUs.
768+
# Manual synchronization is done below.
769+
@threaded_nosync semi for particle in each_integrated_particle(system)
748770
# Now loop over all neighbor systems to avoid separate loops/kernels
749771
# for each pair of systems.
750772
foreach_noalloc(iterator) do (neighbor, v_neighbor, u_neighbor, nhs)
751773
interact!(dv, v_system, u_system, v_neighbor, u_neighbor,
752774
system, neighbor, semi, nhs, particle)
753775
end
754776
end
777+
778+
synchronize && synchronize_backend(semi.parallelization_backend)
755779
end
756780

757781
function check_update_callback(semi)

0 commit comments

Comments
 (0)