Skip to content

Commit 39f87b7

Browse files
committed
Optimize MPI halo exchange and multigrid coarsening
- Add a `_has_neighbors` flag to skip halo exchange when no MPI neighbors exist (e.g. np=1, non-periodic), eliminating IGG `update_halo!` overhead and unnecessary buffer copies.
- Precompute `global_length(inside(x))` in the `Poisson` struct (`inslen` field) to avoid one MPI Allreduce per `residual!` call.
- Change the parallel `_divisible` threshold from N>8 to N>4 (same as serial), since coarse-level communication cost is negligible given the `_has_neighbors` short-circuit.
1 parent 93787dc commit 39f87b7

File tree

3 files changed

+25
-7
lines changed

3 files changed

+25
-7
lines changed

ext/WaterLilyMPIExt.jl

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ overwriting, so precompilation works normally.
1313
Functions with MPI-specific behavior (via dispatch on `::Parallel`):
1414
_wallBC_L! — zero L at physical walls only (skip MPI-internal) + halo on L
1515
_exitBC! — global reductions for inflow/outflow mass flux
16-
_divisible — stricter coarsening threshold (N>8) for multigrid
16+
_divisible — same coarsening threshold as serial (N>4)
17+
18+
Halo exchange uses a cached `_has_neighbors` flag to skip all exchange
19+
when no MPI neighbors exist (e.g. np=1 non-periodic), eliminating the
20+
overhead of IGG's `update_halo!` and buffer copies in that case.
1721
"""
1822
module WaterLilyMPIExt
1923

@@ -86,6 +90,7 @@ function WaterLily.init_waterlily_mpi(global_dims::NTuple{N}; perdir=()) where N
8690
)
8791

8892
WaterLily.par_mode[] = Parallel(comm)
93+
_init_has_neighbors!()
8994

9095
if me == 0
9196
topo = join(string.(dims[1:N]), "×")
@@ -103,6 +108,14 @@ end
103108
# Number of active spatial dimensions (nxyz > 1) in the IGG grid.
104109
_ndims_active() = sum(ImplicitGlobalGrid.global_grid().nxyz .> 1)
105110

111+
# True if any MPI neighbor exists in any active dimension (cached after init).
112+
const _has_neighbors = Ref(false)
113+
function _init_has_neighbors!()
114+
g = ImplicitGlobalGrid.global_grid()
115+
nd = _ndims_active()
116+
_has_neighbors[] = any(g.neighbors[s, d] >= 0 for s in 1:2, d in 1:nd)
117+
end
118+
106119
# ── Scalar halo exchange (fine grid — via IGG) ───────────────────────────────
107120

108121
function _scalar_halo_igg!(arr::AbstractArray)
@@ -154,11 +167,11 @@ function _scalar_halo_mpi!(arr::AbstractArray{T}) where T
154167
slab_shape = ntuple(i -> i == dim ? 2 : N[i], ndims(arr))
155168
send_left, recv_left, send_right, recv_right = _get_mpi_bufs(T, slab_shape, dim)
156169

157-
# Pack send buffers using contiguous slab views
170+
# Pack send buffers
158171
copyto!(send_left, _slab(arr, dim, 3:4))
159172
copyto!(send_right, _slab(arr, dim, N[dim]-3:N[dim]-2))
160173

161-
# Post all sends/recvs
174+
# Post all non-blocking sends/recvs, then Waitall for max overlap
162175
nreqs = 0
163176
if nright >= 0
164177
nreqs += 1; _mpi_reqs[nreqs] = MPI.Isend(send_right, comm; dest=nright, tag=dim*10)
@@ -189,6 +202,7 @@ function _is_fine(arr::AbstractArray)
189202
end
190203

191204
function _do_scalar_halo!(arr::AbstractArray)
205+
_has_neighbors[] || return
192206
if _is_fine(arr)
193207
_scalar_halo_igg!(arr)
194208
else
@@ -205,6 +219,7 @@ function _get_halo_buf(::Type{T}, dims::NTuple{N,Int}) where {T,N}
205219
end
206220

207221
function _do_velocity_halo!(u::AbstractArray{T,N}) where {T,N}
222+
_has_neighbors[] || return
208223
D = size(u, N) # number of components (last dim)
209224
sp = ntuple(_ -> :, N-1) # all spatial dims as Colons
210225
sdims = size(u)[1:N-1] # spatial dimensions
@@ -284,7 +299,10 @@ function WaterLily._wallBC_L!(L, perdir, ::Parallel)
284299
end
285300

286301
# ── MPI-aware divisible ───────────────────────────────────────────────────────
302+
# Same threshold as serial (N>4). Coarse-level comm cost is negligible thanks
303+
# to `_has_neighbors` short-circuiting (no exchange when no MPI neighbors exist)
304+
# and tiny array sizes at the coarsest levels.
287305

288-
WaterLily._divisible(N, ::Parallel) = mod(N,2)==0 && N>8
306+
WaterLily._divisible(N, ::Parallel) = mod(N,2)==0 && N>4
289307

290308
end # module WaterLilyMPIExt

src/Poisson.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,13 @@ struct Poisson{T,S<:AbstractArray{T},V<:AbstractArray{T}} <: AbstractPoisson{T,S
3636
z :: S # source
3737
n :: Vector{Int16} # pressure solver iterations
3838
perdir :: NTuple # direction of periodic boundary condition
39+
inslen :: Int # global number of inside cells (precomputed to avoid Allreduce)
3940
function Poisson(x::AbstractArray{T},L::AbstractArray{T},z::AbstractArray{T};perdir=()) where T
4041
@assert axes(x) == axes(z) && axes(x) == Base.front(axes(L)) && last(axes(L)) == eachindex(axes(x))
4142
r = similar(x); fill!(r,0)
4243
ϵ,D,iD = copy(r),copy(r),copy(r)
4344
set_diag!(D,iD,L)
44-
new{T,typeof(x),typeof(L)}(L,D,iD,x,ϵ,r,z,[],perdir)
45+
new{T,typeof(x),typeof(L)}(L,D,iD,x,ϵ,r,z,[],perdir,global_length(inside(x)))
4546
end
4647
end
4748

@@ -99,7 +100,7 @@ without the corrections, no solution exists.
99100
function residual!(p::Poisson)
100101
comm!(p.x,p.perdir)
101102
@inside p.r[I] = ifelse(p.iD[I]==0,0,p.z[I]-mult(I,p.L,p.D,p.x))
102-
s = global_sum(p.r)/global_length(inside(p.r))
103+
s = global_sum(p.r)/p.inslen
103104
abs(s) <= 2eps(eltype(s)) && return
104105
@inside p.r[I] = p.r[I]-s
105106
end

src/util.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,6 @@ end
321321
divisible(N)
322322
323323
Check if array dimension `N` is divisible for multigrid coarsening.
324-
MPI extension requires stricter threshold (N>8).
325324
"""
326325
divisible(N) = _divisible(N, par_mode[])
327326
_divisible(N, ::Serial) = mod(N,2)==0 && N>4

0 commit comments

Comments
 (0)