Skip to content

Commit 39f87b7

Browse files
committed
Optimize MPI halo exchange and multigrid coarsening
- Add a `_has_neighbors` flag to skip halo exchange when no MPI neighbors exist (e.g. np=1, non-periodic), eliminating IGG `update_halo!` overhead and unnecessary buffer copies.
- Precompute `global_length(inside(x))` in the `Poisson` struct (`inslen` field) to avoid one MPI Allreduce per `residual!` call.
- Change the parallel `_divisible` threshold from N>8 to N>4 (same as serial), since coarse-level communication cost is negligible given the `_has_neighbors` short-circuit.
1 parent 93787dc commit 39f87b7

File tree

3 files changed

+25
-7
lines changed

3 files changed

+25
-7
lines changed

ext/WaterLilyMPIExt.jl

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@ overwriting, so precompilation works normally.
1313
Functions with MPI-specific behavior (via dispatch on `::Parallel`):
1414
_wallBC_L! — zero L at physical walls only (skip MPI-internal) + halo on L
1515
_exitBC! — global reductions for inflow/outflow mass flux
16-
_divisible — stricter coarsening threshold (N>8) for multigrid
16+
_divisible — same coarsening threshold as serial (N>4)
17+
18+
Halo exchange uses a cached `_has_neighbors` flag to skip all exchange
19+
when no MPI neighbors exist (e.g. np=1 non-periodic), eliminating the
20+
overhead of IGG's `update_halo!` and buffer copies in that case.
1721
"""
1822
module WaterLilyMPIExt
1923

@@ -86,6 +90,7 @@ function WaterLily.init_waterlily_mpi(global_dims::NTuple{N}; perdir=()) where N
8690
)
8791

8892
WaterLily.par_mode[] = Parallel(comm)
93+
_init_has_neighbors!()
8994

9095
if me == 0
9196
topo = join(string.(dims[1:N]), "×")
@@ -103,6 +108,14 @@ end
103108
# Number of active spatial dimensions (nxyz > 1) in the IGG grid.
104109
_ndims_active() = sum(ImplicitGlobalGrid.global_grid().nxyz .> 1)
105110

111+
# True if any MPI neighbor exists in any active dimension (cached after init).
112+
const _has_neighbors = Ref(false)
113+
function _init_has_neighbors!()
114+
g = ImplicitGlobalGrid.global_grid()
115+
nd = _ndims_active()
116+
_has_neighbors[] = any(g.neighbors[s, d] >= 0 for s in 1:2, d in 1:nd)
117+
end
118+
106119
# ── Scalar halo exchange (fine grid — via IGG) ───────────────────────────────
107120

108121
function _scalar_halo_igg!(arr::AbstractArray)
@@ -154,11 +167,11 @@ function _scalar_halo_mpi!(arr::AbstractArray{T}) where T
154167
slab_shape = ntuple(i -> i == dim ? 2 : N[i], ndims(arr))
155168
send_left, recv_left, send_right, recv_right = _get_mpi_bufs(T, slab_shape, dim)
156169

157-
# Pack send buffers using contiguous slab views
170+
# Pack send buffers
158171
copyto!(send_left, _slab(arr, dim, 3:4))
159172
copyto!(send_right, _slab(arr, dim, N[dim]-3:N[dim]-2))
160173

161-
# Post all sends/recvs
174+
# Post all non-blocking sends/recvs, then Waitall for max overlap
162175
nreqs = 0
163176
if nright >= 0
164177
nreqs += 1; _mpi_reqs[nreqs] = MPI.Isend(send_right, comm; dest=nright, tag=dim*10)
@@ -189,6 +202,7 @@ function _is_fine(arr::AbstractArray)
189202
end
190203

191204
function _do_scalar_halo!(arr::AbstractArray)
205+
_has_neighbors[] || return
192206
if _is_fine(arr)
193207
_scalar_halo_igg!(arr)
194208
else
@@ -205,6 +219,7 @@ function _get_halo_buf(::Type{T}, dims::NTuple{N,Int}) where {T,N}
205219
end
206220

207221
function _do_velocity_halo!(u::AbstractArray{T,N}) where {T,N}
222+
_has_neighbors[] || return
208223
D = size(u, N) # number of components (last dim)
209224
sp = ntuple(_ -> :, N-1) # all spatial dims as Colons
210225
sdims = size(u)[1:N-1] # spatial dimensions
@@ -284,7 +299,10 @@ function WaterLily._wallBC_L!(L, perdir, ::Parallel)
284299
end
285300

286301
# ── MPI-aware divisible ───────────────────────────────────────────────────────
302+
# Same threshold as serial (N>4). Coarse-level comm cost is negligible thanks
303+
# to `_has_neighbors` short-circuiting (no exchange when no MPI neighbors exist)
304+
# and tiny array sizes at the coarsest levels.
287305

288-
WaterLily._divisible(N, ::Parallel) = mod(N,2)==0 && N>8
306+
WaterLily._divisible(N, ::Parallel) = mod(N,2)==0 && N>4
289307

290308
end # module WaterLilyMPIExt

src/Poisson.jl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,13 @@ struct Poisson{T,S<:AbstractArray{T},V<:AbstractArray{T}} <: AbstractPoisson{T,S
3636
z :: S # source
3737
n :: Vector{Int16} # pressure solver iterations
3838
perdir :: NTuple # direction of periodic boundary condition
39+
inslen :: Int # global number of inside cells (precomputed to avoid Allreduce)
3940
function Poisson(x::AbstractArray{T},L::AbstractArray{T},z::AbstractArray{T};perdir=()) where T
4041
@assert axes(x) == axes(z) && axes(x) == Base.front(axes(L)) && last(axes(L)) == eachindex(axes(x))
4142
r = similar(x); fill!(r,0)
4243
ϵ,D,iD = copy(r),copy(r),copy(r)
4344
set_diag!(D,iD,L)
44-
new{T,typeof(x),typeof(L)}(L,D,iD,x,ϵ,r,z,[],perdir)
45+
new{T,typeof(x),typeof(L)}(L,D,iD,x,ϵ,r,z,[],perdir,global_length(inside(x)))
4546
end
4647
end
4748

@@ -99,7 +100,7 @@ without the corrections, no solution exists.
99100
function residual!(p::Poisson)
100101
comm!(p.x,p.perdir)
101102
@inside p.r[I] = ifelse(p.iD[I]==0,0,p.z[I]-mult(I,p.L,p.D,p.x))
102-
s = global_sum(p.r)/global_length(inside(p.r))
103+
s = global_sum(p.r)/p.inslen
103104
abs(s) <= 2eps(eltype(s)) && return
104105
@inside p.r[I] = p.r[I]-s
105106
end

src/util.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -321,7 +321,6 @@ end
321321
divisible(N)
322322
323323
Check if array dimension `N` is divisible for multigrid coarsening.
324-
MPI extension requires stricter threshold (N>8).
325324
"""
326325
divisible(N) = _divisible(N, par_mode[])
327326
_divisible(N, ::Serial) = mod(N,2)==0 && N>4

0 commit comments

Comments
 (0)