@@ -13,7 +13,11 @@ overwriting, so precompilation works normally.
Functions with MPI-specific behavior (via dispatch on `::Parallel`):
  _wallBC_L!  — zero L at physical walls only (skip MPI-internal) + halo on L
  _exitBC!    — global reductions for inflow/outflow mass flux
  _divisible  — same coarsening threshold as serial (N>4)

Halo exchange uses a cached `_has_neighbors` flag to skip all exchange
when no MPI neighbors exist (e.g. np=1 non-periodic), eliminating the
overhead of IGG's `update_halo!` and buffer copies in that case.
1721"""
1822module WaterLilyMPIExt
1923
@@ -86,6 +90,7 @@ function WaterLily.init_waterlily_mpi(global_dims::NTuple{N}; perdir=()) where N
8690 )
8791
8892 WaterLily. par_mode[] = Parallel (comm)
93+ _init_has_neighbors! ()
8994
9095 if me == 0
9196 topo = join (string .(dims[1 : N]), " ×" )
# ── Grid topology helpers ────────────────────────────────────────────────────

# Number of active spatial dimensions (those with nxyz > 1) in the IGG grid.
# `count(>(1), …)` avoids the temporary BitArray that `sum(nxyz .> 1)` allocates
# while returning the same Int.
_ndims_active() = count(>(1), ImplicitGlobalGrid.global_grid().nxyz)
# Cached flag: true iff this rank has at least one MPI neighbor in any active
# dimension. Set once by `_init_has_neighbors!` after grid init; halo-exchange
# entry points consult it to skip all communication (e.g. np=1 non-periodic).
const _has_neighbors = Ref(false)

# Populate `_has_neighbors` from the IGG neighbor table. A neighbor rank >= 0
# means a real MPI process; negative entries are MPI_PROC_NULL-style sentinels.
# NOTE(review): assumes the active dimensions occupy the first `_ndims_active()`
# columns of `global_grid().neighbors` — confirm this holds for all topologies.
function _init_has_neighbors!()
    grid = ImplicitGlobalGrid.global_grid()
    active_dims = 1:_ndims_active()
    _has_neighbors[] = any(grid.neighbors[side, dim] >= 0
                           for side in 1:2, dim in active_dims)
end
118+
106119# ── Scalar halo exchange (fine grid — via IGG) ───────────────────────────────
107120
108121function _scalar_halo_igg! (arr:: AbstractArray )
@@ -154,11 +167,11 @@ function _scalar_halo_mpi!(arr::AbstractArray{T}) where T
154167 slab_shape = ntuple (i -> i == dim ? 2 : N[i], ndims (arr))
155168 send_left, recv_left, send_right, recv_right = _get_mpi_bufs (T, slab_shape, dim)
156169
157- # Pack send buffers using contiguous slab views
170+ # Pack send buffers
158171 copyto! (send_left, _slab (arr, dim, 3 : 4 ))
159172 copyto! (send_right, _slab (arr, dim, N[dim]- 3 : N[dim]- 2 ))
160173
161- # Post all sends/recvs
174+ # Post all non-blocking sends/recvs, then Waitall for max overlap
162175 nreqs = 0
163176 if nright >= 0
164177 nreqs += 1 ; _mpi_reqs[nreqs] = MPI. Isend (send_right, comm; dest= nright, tag= dim* 10 )
@@ -189,6 +202,7 @@ function _is_fine(arr::AbstractArray)
189202end
190203
191204function _do_scalar_halo! (arr:: AbstractArray )
205+ _has_neighbors[] || return
192206 if _is_fine (arr)
193207 _scalar_halo_igg! (arr)
194208 else
@@ -205,6 +219,7 @@ function _get_halo_buf(::Type{T}, dims::NTuple{N,Int}) where {T,N}
205219end
206220
207221function _do_velocity_halo! (u:: AbstractArray{T,N} ) where {T,N}
222+ _has_neighbors[] || return
208223 D = size (u, N) # number of components (last dim)
209224 sp = ntuple (_ -> :, N- 1 ) # all spatial dims as Colons
210225 sdims = size (u)[1 : N- 1 ] # spatial dimensions
@@ -284,7 +299,10 @@ function WaterLily._wallBC_L!(L, perdir, ::Parallel)
284299end
285300
286301# ── MPI-aware divisible ───────────────────────────────────────────────────────
# Same threshold as serial (N>4): coarse-level communication cost is negligible
# because `_has_neighbors` short-circuits the exchange when no MPI neighbors
# exist, and the arrays at the coarsest multigrid levels are tiny.

WaterLily._divisible(N, ::Parallel) = iseven(N) && N > 4
289307
290308end # module WaterLilyMPIExt
0 commit comments