diff --git a/Project.toml b/Project.toml index 5cb39cf0..9b1560a4 100644 --- a/Project.toml +++ b/Project.toml @@ -8,31 +8,35 @@ ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9" DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" EllipsisNotation = "da5c29d0-fa7d-589e-88eb-ea29b0a81949" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" Requires = "ae029012-a4dd-5104-9daa-d747884805df" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [weakdeps] AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" -JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +ImplicitGlobalGrid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" Meshing = "e6723b4c-ebff-59f1-b4b7-d97aa5274f73" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" -ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3" -WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [extensions] WaterLilyAMDGPUExt = "AMDGPU" WaterLilyCUDAExt = "CUDA" -WaterLilyMakieExt = "Makie" WaterLilyJLD2Ext = "JLD2" +WaterLilyMPIExt = ["ImplicitGlobalGrid", "MPI"] +WaterLilyMakieExt = "Makie" WaterLilyMeshingExt = ["Makie", "Meshing"] WaterLilyPlotsExt = "Plots" WaterLilyReadVTKExt = "ReadVTK" @@ -43,12 +47,18 @@ ConstructionBase = "1.6.0" DocStringExtensions = "0.9" EllipsisNotation = "1.8" ForwardDiff = "0.10.18, 1" +GPUArrays = "11.4.1" +ImplicitGlobalGrid = "0.16" +JLD2 = "0.6.4" KernelAbstractions = "0.9.1" LoggingExtras = "1.1" +MPI = "0.20" Preferences = "1.4" +ReadVTK = "0.2.6" Reexport = "^1.2.2" Requires = "1.3" StaticArrays = "^1.1.0" +WriteVTK = "1.21.2" julia = "1.10" [extras] @@ -56,8 +66,10 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" +ImplicitGlobalGrid = "4d7a3746-15be-11ea-1130-334b0c4f5fa0" JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" +MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" Meshing = "e6723b4c-ebff-59f1-b4b7-d97aa5274f73" PerformanceTestTools = "dc46b164-d16f-48ec-a853-60448fc869fe" @@ -68,4 +80,4 @@ UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228" WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192" [targets] -test = ["Test", "BenchmarkTools", "CUDA", "AMDGPU", "GPUArrays", "WriteVTK", "ReadVTK", "JLD2"] +test = ["Test", "BenchmarkTools", "CUDA", "AMDGPU", "GPUArrays", "WriteVTK", "ReadVTK", "JLD2", "MPI", "ImplicitGlobalGrid"] diff --git a/ext/WaterLilyMPIExt.jl b/ext/WaterLilyMPIExt.jl new file mode 100644 index 00000000..516b10ba --- /dev/null +++ b/ext/WaterLilyMPIExt.jl @@ -0,0 +1,304 @@ +""" +WaterLilyMPIExt — Julia package extension +========================================== +Activated automatically when ImplicitGlobalGrid and MPI are loaded alongside +WaterLily. Provides MPI-aware overrides for global reductions, halo exchange, +and boundary conditions at MPI-subdomain interfaces. + +Uses the `AbstractParMode` dispatch pattern: serial WaterLily dispatches all +hooks through `par_mode[]` (defaults to `Serial()`). This extension defines +`Parallel <: AbstractParMode` and adds new dispatch methods — no method +overwriting, so precompilation works normally. + +Functions with MPI-specific behavior (via dispatch on `::Parallel`): + _wallBC_L! — zero L at physical walls only (skip MPI-internal) + halo on L + _exitBC! — global reductions for inflow/outflow mass flux + _divisible — same coarsening threshold as serial (N>4) + +Halo exchange uses a cached `_has_neighbors` flag to skip all exchange +when no MPI neighbors exist (e.g. np=1 non-periodic), eliminating the +overhead of IGG's `update_halo!` and buffer copies in that case. +""" +module WaterLilyMPIExt + +using WaterLily +import WaterLily: @loop +using ImplicitGlobalGrid +using MPI +using StaticArrays + +# ── MPI parallel mode ──────────────────────────────────────────────────────── + +struct Parallel <: WaterLily.AbstractParMode + comm::MPI.Comm +end + +_comm() = (WaterLily.par_mode[]::Parallel).comm + +# ── Global coordinate offset ────────────────────────────────────────────────── + +""" + _global_offset(Val(N), T, ::Parallel) → SVector{N,T} + +Rank-local origin in global WaterLily index space. + offset[d] = coords[d] * (nxyz[d] - overlaps[d]) = coords[d] * nx_loc +""" +function WaterLily._global_offset(::Val{N}, ::Type{T}, ::Parallel) where {N,T} + g = ImplicitGlobalGrid.global_grid() + SVector{N,T}(ntuple(d -> T(g.coords[d] * (g.nxyz[d] - g.overlaps[d])), N)) +end + +# ── MPI initialization ─────────────────────────────────────────────────────── + +""" + init_waterlily_mpi(global_dims; perdir=()) → (local_dims, rank, comm) + +Initialize MPI domain decomposition for WaterLily. + +1. Determines the optimal MPI topology via `MPI.Dims_create` +2. Computes local subdomain dimensions (`global_dims .÷ topology`) +3. Initializes ImplicitGlobalGrid with the correct overlaps and halowidths +4. Sets `par_mode[] = Parallel(comm)` for dispatch-based MPI hooks + +Returns `(local_dims::NTuple{N,Int}, rank::Int, comm::MPI.Comm)`. +""" +function WaterLily.init_waterlily_mpi(global_dims::NTuple{N}; perdir=()) where N + MPI.Initialized() || MPI.Init() + nprocs = MPI.Comm_size(MPI.COMM_WORLD) + + # Optimal MPI topology for N active dimensions + mpi_dims = Tuple(Int.(MPI.Dims_create(nprocs, zeros(Int, N)))) + + # Local interior dims + local_dims = global_dims .÷ mpi_dims + all(global_dims .== local_dims .* mpi_dims) || + error("Global dims $global_dims not evenly divisible by MPI topology " * + "$mpi_dims with $nprocs ranks") + + # Pad to 3D for IGG (which always expects 3 dimensions) + igg_local = ntuple(d -> d <= N ? local_dims[d] + 4 : 1, 3) + igg_mpi = ntuple(d -> d <= N ? mpi_dims[d] : 1, 3) + igg_per = ntuple(d -> d <= N && d in perdir ? 1 : 0, 3) + + me, dims, np, coords, comm = init_global_grid( + igg_local...; + dimx = igg_mpi[1], dimy = igg_mpi[2], dimz = igg_mpi[3], + overlaps = (4, 4, 4), + halowidths = (2, 2, 2), + periodx = igg_per[1], periody = igg_per[2], periodz = igg_per[3], + init_MPI = false, + ) + + WaterLily.par_mode[] = Parallel(comm) + _init_has_neighbors!() + + if me == 0 + topo = join(string.(dims[1:N]), "×") + loc = join(string.(local_dims), "×") + glob = join(string.(global_dims), "×") + @info "WaterLily MPI: $(np) ranks, topology=$(topo), " * + "local=$(loc), global=$(glob)" + end + + return local_dims, me, comm +end + +# ── Dimension helpers ───────────────────────────────────────────────────────── + +# Number of active spatial dimensions (nxyz > 1) in the IGG grid. +_ndims_active() = sum(ImplicitGlobalGrid.global_grid().nxyz .> 1) + +# True if any MPI neighbor exists in any active dimension (cached after init). +const _has_neighbors = Ref(false) +function _init_has_neighbors!() + g = ImplicitGlobalGrid.global_grid() + nd = _ndims_active() + _has_neighbors[] = any(g.neighbors[s, d] >= 0 for s in 1:2, d in 1:nd) +end + +# ── Scalar halo exchange (fine grid — via IGG) ─────────────────────────────── + +function _scalar_halo_igg!(arr::AbstractArray) + nd = _ndims_active() + if ndims(arr) < 3 + arr3d = reshape(arr, size(arr)..., ntuple(_->1, 3-ndims(arr))...) + update_halo!(arr3d; dims=ntuple(identity, nd)) + else + update_halo!(arr; dims=ntuple(identity, nd)) + end +end + +# ── Direct MPI halo exchange (any array size) ──────────────────────────────── +# +# IGG pre-allocates MPI send/recv buffers sized for the registered fine grid. +# Calling update_halo! on coarse multigrid arrays produces garbage. This +# function performs a direct MPI halo exchange using Isend/Irecv! with freshly +# allocated buffers sized for the actual array. It exchanges 2-cell-wide +# slabs in each active spatial dimension. + +# Pre-allocated MPI send/recv buffers keyed by (eltype, slab_shape, dim_tag). +const _mpi_bufs = Dict{Tuple, NTuple{4,Array}}() + +function _get_mpi_bufs(::Type{T}, slab_shape::Tuple, dim::Int) where T + get!(_mpi_bufs, (T, slab_shape, dim)) do + (zeros(T, slab_shape), zeros(T, slab_shape), + zeros(T, slab_shape), zeros(T, slab_shape)) + end +end + +function _slab(arr::AbstractArray, dim::Int, r::UnitRange) + colons = ntuple(i -> i == dim ? r : (:), ndims(arr)) + @view arr[colons...] +end + +# Pre-allocated request buffer (max 4 requests per dim exchange) +const _mpi_reqs = MPI.Request[MPI.REQUEST_NULL for _ in 1:4] + +function _scalar_halo_mpi!(arr::AbstractArray{T}) where T + g = ImplicitGlobalGrid.global_grid() + nd = _ndims_active() + N = size(arr) + comm = _comm() + for dim in 1:nd + nleft = g.neighbors[1, dim] + nright = g.neighbors[2, dim] + (nleft < 0 && nright < 0) && continue + + slab_shape = ntuple(i -> i == dim ? 2 : N[i], ndims(arr)) + send_left, recv_left, send_right, recv_right = _get_mpi_bufs(T, slab_shape, dim) + + # Pack send buffers + copyto!(send_left, _slab(arr, dim, 3:4)) + copyto!(send_right, _slab(arr, dim, N[dim]-3:N[dim]-2)) + + # Post all non-blocking sends/recvs, then Waitall for max overlap + nreqs = 0 + if nright >= 0 + nreqs += 1; _mpi_reqs[nreqs] = MPI.Isend(send_right, comm; dest=nright, tag=dim*10) + nreqs += 1; _mpi_reqs[nreqs] = MPI.Irecv!(recv_right, comm; source=nright, tag=dim*10+1) + end + if nleft >= 0 + nreqs += 1; _mpi_reqs[nreqs] = MPI.Isend(send_left, comm; dest=nleft, tag=dim*10+1) + nreqs += 1; _mpi_reqs[nreqs] = MPI.Irecv!(recv_left, comm; source=nleft, tag=dim*10) + end + MPI.Waitall(MPI.RequestSet(_mpi_reqs[1:nreqs])) + + # Unpack recv buffers + if nleft >= 0 + copyto!(_slab(arr, dim, 1:2), recv_left) + end + if nright >= 0 + copyto!(_slab(arr, dim, N[dim]-1:N[dim]), recv_right) + end + end +end + +# ── Unified scalar halo exchange ───────────────────────────────────────────── + +function _is_fine(arr::AbstractArray) + g = ImplicitGlobalGrid.global_grid() + nd = _ndims_active() + size(arr)[1:nd] == Tuple(g.nxyz[1:nd]) +end + +function _do_scalar_halo!(arr::AbstractArray) + _has_neighbors[] || return + if _is_fine(arr) + _scalar_halo_igg!(arr) + else + _scalar_halo_mpi!(arr) + end +end + +# ── Vector (velocity-shaped) halo exchange ──────────────────────────────────── + +const _halo_bufs = Dict{Tuple, Array}() + +function _get_halo_buf(::Type{T}, dims::NTuple{N,Int}) where {T,N} + get!(() -> Array{T}(undef, dims), _halo_bufs, (T, dims)) +end + +function _do_velocity_halo!(u::AbstractArray{T,N}) where {T,N} + _has_neighbors[] || return + D = size(u, N) # number of components (last dim) + sp = ntuple(_ -> :, N-1) # all spatial dims as Colons + sdims = size(u)[1:N-1] # spatial dimensions + tmp = _get_halo_buf(T, sdims) # single pre-allocated buffer + for d in 1:D + copyto!(tmp, @view u[sp..., d]) + _do_scalar_halo!(tmp) + copyto!(@view(u[sp..., d]), tmp) + end +end + +# ── Dispatch hooks for Parallel ────────────────────────────────────────────── +WaterLily._global_allreduce(x, ::Parallel) = MPI.Allreduce(x, MPI.SUM, _comm()) +WaterLily._global_min(a, b, ::Parallel) = MPI.Allreduce(min(a, b), MPI.MIN, _comm()) +WaterLily._scalar_halo!(x, ::Parallel) = _do_scalar_halo!(x) +WaterLily._velocity_halo!(u, ::Parallel) = _do_velocity_halo!(u) + +# Communication hooks: in parallel, MPI halo handles periodicity +WaterLily._comm!(a, perdir, ::Parallel) = _do_scalar_halo!(a) +WaterLily._velocity_comm!(a, perdir, ::Parallel) = _do_velocity_halo!(a) + +# ── MPI-aware exitBC! ──────────────────────────────────────────────────────── + +function WaterLily._exitBC!(u, u⁰, Δt, ::Parallel) + g = ImplicitGlobalGrid.global_grid() + comm = _comm() + N, _ = WaterLily.size_u(u) + + is_inflow = g.neighbors[1, 1] < 0 + is_exit = g.neighbors[2, 1] < 0 + + # All ranks participate in Allreduce for exit face area + local_exit_len = is_exit ? length(WaterLily.slice(N .- 2, N[1] - 1, 1, 3)) : 0 + global_exit_len = MPI.Allreduce(local_exit_len, MPI.SUM, comm) + + # All ranks participate in Allreduce for mean inflow velocity + local_inflow_sum = is_inflow ? sum(@view(u[WaterLily.slice(N .- 2, 2, 1, 3), 1])) : zero(eltype(u)) + U = MPI.Allreduce(local_inflow_sum, MPI.SUM, comm) / global_exit_len + + # Convective exit on rightmost-x ranks only + if is_exit + exitR = WaterLily.slice(N .- 2, N[1] - 1, 1, 3) + @loop u[I, 1] = u⁰[I, 1] - U * Δt * (u⁰[I, 1] - u⁰[I - WaterLily.δ(1, I), 1]) over I ∈ exitR + end + + # All ranks participate in Allreduce for mass flux correction + local_exit_sum = is_exit ? sum(@view(u[WaterLily.slice(N .- 2, N[1] - 1, 1, 3), 1])) : zero(eltype(u)) + global_exit_sum = MPI.Allreduce(local_exit_sum, MPI.SUM, comm) + ∮u = global_exit_sum / global_exit_len - U + if is_exit + exitR = WaterLily.slice(N .- 2, N[1] - 1, 1, 3) + @loop u[I, 1] -= ∮u over I ∈ exitR + end + + _do_velocity_halo!(u) +end + +# ── MPI-aware wallBC_L! ────────────────────────────────────────────────────── + +function WaterLily._wallBC_L!(L, perdir, ::Parallel) + g = ImplicitGlobalGrid.global_grid() + N, n = WaterLily.size_u(L) + for j in 1:n + j in perdir && continue + if g.neighbors[1, j] < 0 # physical left wall + @loop L[I,j] = zero(eltype(L)) over I ∈ WaterLily.slice(N, 3, j) + end + if g.neighbors[2, j] < 0 # physical right wall + @loop L[I,j] = zero(eltype(L)) over I ∈ WaterLily.slice(N, N[j]-1, j) + end + end + _do_velocity_halo!(L) +end + +# ── MPI-aware divisible ─────────────────────────────────────────────────────── +# Same threshold as serial (N>4). Coarse-level comm cost is negligible thanks +# to `_has_neighbors` short-circuiting (no exchange when no MPI neighbors exist) +# and tiny array sizes at the coarsest levels. + +WaterLily._divisible(N, ::Parallel) = mod(N,2)==0 && N>4 + +end # module WaterLilyMPIExt diff --git a/ext/WaterLilyWriteVTKExt.jl b/ext/WaterLilyWriteVTKExt.jl index a8224dbd..8cb4e2a6 100644 --- a/ext/WaterLilyWriteVTKExt.jl +++ b/ext/WaterLilyWriteVTKExt.jl @@ -42,13 +42,19 @@ to put in the file. With this approach, any variable can be save to the vtk file _velocity(a::AbstractSimulation) = a.flow.u |> Array; _pressure(a::AbstractSimulation) = a.flow.p |> Array; default_attrib() = Dict("Velocity"=>_velocity, "Pressure"=>_pressure) + """ - save!(w::VTKWriter, sim<:AbstractSimulation) + save!(w::VTKWriter, a::AbstractSimulation) -Write the simulation data at time `sim_time(sim)` to a `vti` file and add the file path -to the collection file. +Write the simulation data to VTK files. Dispatches on `par_mode[]`: + - Serial → single `.vti` file + - Parallel → per-rank `.vti` pieces + `.pvti` header (rank 0) """ function save!(w::VTKWriter, a::AbstractSimulation) + _save!(w, a, WaterLily.par_mode[]) +end + +function _save!(w::VTKWriter, a::AbstractSimulation, ::WaterLily.Serial) k = w.count[1]; N=size(a.flow.p) vtk = vtk_grid(w.dir_name*@sprintf("/%s_%06i", w.fname, k), [1:n for n in N]...) for (name,func) in w.output_attrib @@ -58,6 +64,63 @@ function save!(w::VTKWriter, a::AbstractSimulation) vtk_save(vtk); w.count[1]=k+1 w.collection[round(sim_time(a),digits=4)]=vtk end + +function _save!(w::VTKWriter, a::AbstractSimulation, ::WaterLily.AbstractParMode) + mpi_id = Base.PkgId(Base.UUID("da04e1cc-30fd-572f-bb4f-1f8673147195"), "MPI") + igg_id = Base.PkgId(Base.UUID("4d7a3746-15be-11ea-1130-334b0c4f5fa0"), "ImplicitGlobalGrid") + + if !haskey(Base.loaded_modules, mpi_id) || !haskey(Base.loaded_modules, igg_id) + _save!(w, a, WaterLily.Serial()); return + end + + MPIMod = Base.loaded_modules[mpi_id] + IGG = Base.loaded_modules[igg_id] + + comm = MPIMod.COMM_WORLD + me = MPIMod.Comm_rank(comm) + nprocs = MPIMod.Comm_size(comm) + nprocs == 1 && (_save!(w, a, WaterLily.Serial()); return) + + g = IGG.global_grid() + nd = sum(g.nxyz .> 1) # active spatial dimensions + nx = ntuple(d -> g.nxyz[d] - g.overlaps[d], nd) # interior cells per rank + + # Gather every rank's coords so all ranks build the same extents array. + my_coords = Int32[g.coords[d] for d in 1:nd] + all_coords = MPIMod.Allgather(my_coords, comm) + coords_mat = reshape(all_coords, nd, nprocs) + + # Extents use VTK point indices (nx+1 points → nx cells). + # Adjacent ranks overlap by 1 point so ParaView sees contiguous pieces. + extents = [ + ntuple(d -> (Int(coords_mat[d,r]) * nx[d] + 1):(Int(coords_mat[d,r]) * nx[d] + nx[d] + 1), nd) + for r in 1:nprocs + ] + + k = w.count[1] + fname = joinpath(w.dir_name, @sprintf("%s_%06i", w.fname, k)) + + pvtk = pvtk_grid(fname, + ntuple(d -> collect(Float32, extents[me+1][d]), nd)...; + part = me + 1, extents = extents) + + for (name, func) in w.output_attrib + data = func(a) + if ndims(data) > nd # vector field (spatial..., D) + pvtk[name, VTKCellData()] = components_first(data) + else # scalar field + pvtk[name, VTKCellData()] = data + end + end + + if me == 0 + w.collection[round(sim_time(a), digits=4)] = pvtk + else + vtk_save(pvtk) + end + w.count[1] = k + 1 +end + """ close(w::VTKWriter) diff --git a/src/Body.jl b/src/Body.jl index d855a138..a0ef6656 100644 --- a/src/Body.jl +++ b/src/Body.jl @@ -46,8 +46,9 @@ function measure!(a::Flow{N,T},body::AbstractBody;t=zero(T),ϵ=1) where {N,T} end end @loop fill!(a.μ₀,a.μ₁,a.V,a.σ,I) over I ∈ inside(a.p) - BC!(a.μ₀,zeros(SVector{N,T}),false,a.perdir) # BC on μ₀, don't fill normal component yet + BC!(a.μ₀,zeros(SVector{N,T}),false,a.perdir) BC!(a.V ,zeros(SVector{N,T}),a.exitBC,a.perdir) + velocity_halo!(reshape(a.μ₁, size(a.σ)..., :)) # halo on μ₁ tensor (no-op in serial) end # Convolution kernel and its moments @@ -79,7 +80,7 @@ Use for a simulation without a body. """ struct NoBody <: AbstractBody end measure(::NoBody,x::AbstractVector,args...;kwargs...)=(Inf,zero(x),zero(x)) -function measure!(::Flow,::NoBody;kwargs...) end # skip measure! entirely +function measure!(::Flow,::NoBody;kwargs...) end # μ₀ already ones from Flow constructor """ SetBody @@ -93,6 +94,29 @@ struct SetBody{O<:Function,Ta<:AbstractBody,Tb<:AbstractBody} <: AbstractBody b::Tb end +""" + OffsetBody(body, offset) <: AbstractBody + +Wraps any `AbstractBody` so that rank-local coordinates are shifted by +`offset` to global coordinates before `measure`/`sdf` evaluation. +Created automatically by `_apply_offset`; users should not construct directly. +""" +struct OffsetBody{B<:AbstractBody,O} <: AbstractBody + body::B + offset::O +end +measure(b::OffsetBody,x,t;kwargs...) = measure(b.body, x .+ b.offset, t; kwargs...) +sdf(b::OffsetBody,x,t=0;kwargs...) = sdf(b.body, x .+ b.offset, t; kwargs...) + +""" + _apply_offset(body::AbstractBody, offset) + +Wrap the body in an `OffsetBody` so that rank-local coordinates are shifted +by `offset` before evaluation. Works for any `AbstractBody` subtype. +""" +_apply_offset(body::AbstractBody, offset) = OffsetBody(body, offset) +_apply_offset(body::NoBody, offset) = body + # Lazy constructors Base.:∪(a::AbstractBody, b::AbstractBody) = SetBody(min,a,b) Base.:+(a::AbstractBody, b::AbstractBody) = a ∪ b diff --git a/src/Flow.jl b/src/Flow.jl index bcdd8aa1..b29072e0 100644 --- a/src/Flow.jl +++ b/src/Flow.jl @@ -6,9 +6,6 @@ @fastmath cds(u,c,d) = (c+d)/2 @inline ϕu(a,I,f,u,λ) = @inbounds u>0 ? u*λ(f[I-2δ(a,I)],f[I-δ(a,I)],f[I]) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)]) -@inline ϕuP(a,Ip,I,f,u,λ) = @inbounds u>0 ? u*λ(f[Ip],f[I-δ(a,I)],f[I]) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)]) -@inline ϕuL(a,I,f,u,λ) = @inbounds u>0 ? u*ϕ(a,I,f) : u*λ(f[I+δ(a,I)],f[I],f[I-δ(a,I)]) -@inline ϕuR(a,I,f,u,λ) = @inbounds u<0 ? u*ϕ(a,I,f) : u*λ(f[I-2δ(a,I)],f[I-δ(a,I)],f[I]) @fastmath @inline function div(I::CartesianIndex{m},u) where {m} init=zero(eltype(u)) @@ -35,32 +32,23 @@ function median(a,b,c) return a end +""" + conv_diff!(r, u, Φ, λ; ν=0.1, perdir=()) + +Compute the convective and diffusive fluxes of velocity `u` using the +convection scheme `λ(u,c,d)` and kinematic viscosity `ν`. The result is +accumulated into `r` (force per unit volume) and `Φ` is a working scalar. +""" function conv_diff!(r,u,Φ,λ::F;ν=0.1,perdir=()) where {F} r .= zero(eltype(r)) N,n = size_u(u) for i ∈ 1:n, j ∈ 1:n - # if it is periodic direction - tagper = (j in perdir) - # treatment for bottom boundary with BCs - lowerBoundary!(r,u,Φ,ν,i,j,N,λ,Val{tagper}()) - # inner cells @loop (Φ[I] = ϕu(j,CI(I,i),u,ϕ(i,CI(I,j),u),λ) - ν*∂(j,CI(I,i),u); r[I,i] += Φ[I]) over I ∈ inside_u(N,j) @loop r[I-δ(j,I),i] -= Φ[I] over I ∈ inside_u(N,j) - # treatment for upper boundary with BCs - upperBoundary!(r,u,Φ,ν,i,j,N,λ,Val{tagper}()) end end -# Neumann BC Building block -lowerBoundary!(r,u,Φ,ν,i,j,N,λ,::Val{false}) = @loop r[I,i] += ϕuL(j,CI(I,i),u,ϕ(i,CI(I,j),u),λ) - ν*∂(j,CI(I,i),u) over I ∈ slice(N,2,j,2) -upperBoundary!(r,u,Φ,ν,i,j,N,λ,::Val{false}) = @loop r[I-δ(j,I),i] += -ϕuR(j,CI(I,i),u,ϕ(i,CI(I,j),u),λ) + ν*∂(j,CI(I,i),u) over I ∈ slice(N,N[j],j,2) - -# Periodic BC Building block -lowerBoundary!(r,u,Φ,ν,i,j,N,λ,::Val{true}) = @loop ( - Φ[I] = ϕuP(j,CIj(j,CI(I,i),N[j]-2),CI(I,i),u,ϕ(i,CI(I,j),u),λ) -ν*∂(j,CI(I,i),u); r[I,i] += Φ[I]) over I ∈ slice(N,2,j,2) -upperBoundary!(r,u,Φ,ν,i,j,N,λ,::Val{true}) = @loop r[I-δ(j,I),i] -= Φ[CIj(j,I,2)] over I ∈ slice(N,N[j],j,2) - """ accelerate!(r,t,g,U) @@ -72,7 +60,7 @@ accelerate!(r,t,g::Function,::Union{Nothing,Tuple}) = accelerate!(r,t,g) accelerate!(r,t,::Nothing,U::Function) = accelerate!(r,t,(i,x,t)->ForwardDiff.derivative(τ->U(i,x,τ),t)) accelerate!(r,t,g::Function,U::Function) = accelerate!(r,t,(i,x,t)->g(i,x,t)+ForwardDiff.derivative(τ->U(i,x,τ),t)) """ - Flow{D::Int, T::Float, Sf<:AbstractArray{T,D}, Vf<:AbstractArray{T,D+1}, Tf<:AbstractArray{T,D+2}} + Flow{D, T, Sf, Vf, Tf} Composite type for a multidimensional immersed boundary flow simulation. @@ -80,6 +68,9 @@ Flow solves the unsteady incompressible [Navier-Stokes equations](https://en.wik Solid boundaries are modelled using the [Boundary Data Immersion Method](https://eprints.soton.ac.uk/369635/). The primary variables are the scalar pressure `p` (an array of dimension `D`) and the velocity vector field `u` (an array of dimension `D+1`). + +All arrays use the N+4 staggered layout: `N` interior cells plus 2 ghost/boundary +cells per side, giving total size `M = N + 4` per spatial dimension. """ struct Flow{D, T, Sf<:AbstractArray{T}, Vf<:AbstractArray{T}, Tf<:AbstractArray{T}} # Fluid fields @@ -96,12 +87,12 @@ struct Flow{D, T, Sf<:AbstractArray{T}, Vf<:AbstractArray{T}, Tf<:AbstractArray{ uBC :: Union{NTuple{D,Number},Function} # domain boundary values/function Δt:: Vector{T} # time step (stored in CPU memory) ν :: T # kinematic viscosity - g :: Union{Function,Nothing} # acceleration field funciton + g :: Union{Function,Nothing} # acceleration field function exitBC :: Bool # Convection exit perdir :: NTuple # tuple of periodic direction function Flow(N::NTuple{D}, uBC; f=Array, Δt=0.25, ν=0., g=nothing, uλ=nothing, perdir=(), exitBC=false, T=Float32) where D - Ng = N .+ 2 + Ng = N .+ 4 Nd = (Ng..., D) isnothing(uλ) && (uλ = ic_function(uBC)) u = Array{T}(undef, Nd...) |> f @@ -122,12 +113,26 @@ Current flow time. """ time(a::Flow) = sum(@view(a.Δt[1:end-1])) +""" + BDIM!(a::Flow) + +Apply the Boundary Data Immersion Method to enforce the body velocity. +Uses the zeroth and first kernel moments (`μ₀`, `μ₁`) to blend the +body velocity `V` with the predicted fluid velocity. +""" function BDIM!(a::Flow) dt = a.Δt[end] @loop a.f[Ii] = a.u⁰[Ii]+dt*a.f[Ii]-a.V[Ii] over Ii in CartesianIndices(a.f) @loop a.u[Ii] += μddn(Ii,a.μ₁,a.f)+a.V[Ii]+a.μ₀[Ii]*a.f[Ii] over Ii ∈ inside_u(size(a.p)) end +""" + project!(a::Flow, b::AbstractPoisson, w=1) + +Project the velocity field onto a divergence-free field by solving +the pressure Poisson equation `∇⋅(L∇p) = ∇⋅u/Δt` and correcting +the velocity: `u -= L∇p`. The weight `w` scales `Δt` (0.5 for the corrector). +""" function project!(a::Flow{n},b::AbstractPoisson,w=1) where n dt = w*a.Δt[end] @inside b.z[I] = div(I,a.u); b.x .*= dt # set source term & solution IC @@ -165,9 +170,15 @@ and the `AbstractPoisson` pressure solver to project the velocity onto an incomp end scale_u!(a,scale) = @loop a.u[Ii] *= scale over Ii ∈ inside_u(size(a.p)) +""" + CFL(a::Flow; Δt_max=10) + +Compute the CFL-limited time step from the maximum outward flux at each cell. +Uses `global_min` for MPI-safe reduction across all ranks. +""" function CFL(a::Flow;Δt_max=10) @inside a.σ[I] = flux_out(I,a.u) - min(Δt_max,inv(maximum(a.σ)+5a.ν)) + global_min(Δt_max,inv(maximum(a.σ)+5a.ν)) end @fastmath @inline function flux_out(I::CartesianIndex{d},u) where {d} s = zero(eltype(u)) diff --git a/src/Metrics.jl b/src/Metrics.jl index 18345972..3f2c0562 100644 --- a/src/Metrics.jl +++ b/src/Metrics.jl @@ -72,15 +72,18 @@ Compute ``∥𝛚∥`` at the center of cell `I`. """ ω_mag(I::CartesianIndex{3},u) = norm2(ω(I,u)) """ - ω_θ(I::CartesianIndex{3},z,center,u) + ω_θ(I::CartesianIndex{3}, z, center, u, offset=zero(SVector{3,eltype(u)})) Compute ``𝛚⋅𝛉`` at the center of cell `I` where ``𝛉`` is the azimuth -direction around vector `z` passing through `center`. +direction around vector `z` passing through `center`. Pass `offset` to +map rank-local indices to global coordinates in MPI parallel. +All arguments are GPU-capturable (arrays and value types). """ -function ω_θ(I::CartesianIndex{3},z,center,u) - θ = z × (loc(0,I,eltype(u))-SVector{3}(center)) +function ω_θ(I::CartesianIndex{3},z,center,u,offset=zero(SVector{3,eltype(u)})) + T = eltype(u) + θ = z × (loc(0,I,T)+offset-SVector{3}(center)) n = norm2(θ) - n<=eps(n) ? 0. : θ'*ω(I,u) / n + n<=eps(n) ? zero(T) : θ'*ω(I,u) / n end """ @@ -97,6 +100,8 @@ end pressure_force(sim::Simulation) Compute the pressure force on an immersed body. +MPI-aware: each rank computes its local contribution, then +`global_allreduce` sums across ranks. """ pressure_force(sim) = pressure_force(sim.flow,sim.body) pressure_force(flow,body) = pressure_force(flow.p,flow.f,body,time(flow)) @@ -104,7 +109,7 @@ function pressure_force(p,df,body,t=0) Tp = eltype(p); To = promote_type(Float64,Tp) df .= zero(Tp) @loop df[I,:] .= p[I]*nds(body,loc(0,I,Tp),t) over I ∈ inside(p) - sum(To,df,dims=ntuple(i->i,ndims(p)))[:] |> Array + global_allreduce(sum(To,df,dims=ntuple(i->i,ndims(p)))[:] |> Array) end """ @@ -119,6 +124,8 @@ S(I::CartesianIndex{3},u) = @SMatrix [0.5*(∂(i,j,I,u)+∂(j,i,I,u)) for i ∈ viscous_force(sim::Simulation) Compute the viscous force on an immersed body. +MPI-aware: each rank computes its local contribution, then +`global_allreduce` sums across ranks. """ viscous_force(sim) = viscous_force(sim.flow,sim.body) viscous_force(flow,body) = viscous_force(flow.u,flow.ν,flow.f,body,time(flow)) @@ -126,7 +133,7 @@ function viscous_force(u,ν,df,body,t=0) Tu = eltype(u); To = promote_type(Float64,Tu) df .= zero(Tu) @loop df[I,:] .= -2ν*S(I,u)*nds(body,loc(0,I,Tu),t) over I ∈ inside_u(u) - sum(To,df,dims=ntuple(i->i,ndims(u)-1))[:] |> Array + global_allreduce(sum(To,df,dims=ntuple(i->i,ndims(u)-1))[:] |> Array) end """ @@ -138,17 +145,24 @@ total_force(sim) = pressure_force(sim) .+ viscous_force(sim) using LinearAlgebra: cross """ - pressure_moment(x₀,sim::Simulation) + pressure_moment(x₀, sim::Simulation) -Computes the pressure moment on an immersed body relative to point x₀. +Compute the pressure moment on an immersed body relative to point `x₀`. +MPI-aware: each rank computes its local contribution, then +`global_allreduce` sums across ranks. + +The low-level method takes an explicit `offset` to map rank-local indices +to global coordinates (pass `global_offset(Val(N), T)`). """ pressure_moment(x₀,sim) = pressure_moment(x₀,sim.flow,sim.body) -pressure_moment(x₀,flow,body) = pressure_moment(x₀,flow.p,flow.f,body,time(flow)) -function pressure_moment(x₀,p,df,body,t=0) +function pressure_moment(x₀,flow::Flow{N,T},body,t=time(flow)) where {N,T} + pressure_moment(x₀,flow.p,flow.f,body,t,global_offset(Val(N),T)) +end +function pressure_moment(x₀,p,df,body,t,offset=zero(x₀)) Tp = eltype(p); To = promote_type(Float64,Tp) df .= zero(Tp) - @loop df[I,:] .= p[I]*cross(loc(0,I,Tp)-x₀,nds(body,loc(0,I,Tp),t)) over I ∈ inside(p) - sum(To,df,dims=ntuple(i->i,ndims(p)))[:] |> Array + @loop df[I,:] .= p[I]*cross(loc(0,I,Tp)+offset-x₀,nds(body,loc(0,I,Tp),t)) over I ∈ inside(p) + global_allreduce(sum(To,df,dims=ntuple(i->i,ndims(p)))[:] |> Array) end """ @@ -170,7 +184,7 @@ struct MeanFlow{T, Sf<:AbstractArray{T}, Vf<:AbstractArray{T}, Mf} new{T,typeof(P),typeof(U),typeof(UU)}(P,U,UU,T[t_init],uu_stats) end function MeanFlow(N::NTuple{D}; mem=Array, T=Float32, t_init=0, uu_stats=false) where {D} - Ng = N .+ 2 + Ng = N .+ 4 P = zeros(T, Ng) |> mem U = zeros(T, Ng..., D) |> mem UU = uu_stats ? zeros(T, Ng..., D, D) |> mem : nothing diff --git a/src/MultiLevelPoisson.jl b/src/MultiLevelPoisson.jl index b710464f..275f4b38 100644 --- a/src/MultiLevelPoisson.jl +++ b/src/MultiLevelPoisson.jl @@ -1,5 +1,21 @@ -@inline up(I::CartesianIndex,a=0) = (2I-2oneunit(I)):(2I-oneunit(I)-δ(a,I)) -@inline down(I::CartesianIndex) = CI((I+2oneunit(I)).I .÷2) +""" + up(I, a=0) + +Return the fine-grid `CartesianIndices` range that maps to coarse cell `I`. +When `a≠0`, the range is shifted by `-δ(a,I)` for staggered (face) restriction. +""" +@inline up(I::CartesianIndex,a=0) = (2I-3oneunit(I)):(2I-2oneunit(I)-δ(a,I)) +""" + down(I) + +Map fine-grid index `I` to the corresponding coarse-grid index. +""" +@inline down(I::CartesianIndex) = CI((I.I .- 1) .÷ 2 .+ 2) +""" + restrict(I, b) + +Sum the fine-grid scalar values in `b` that map to coarse cell `I`. +""" @fastmath @inline function restrict(I::CartesianIndex,b) s = zero(eltype(b)) for J ∈ up(I) @@ -7,6 +23,12 @@ end return s end +""" + restrictL(I, i, b) + +Restrict the Poisson conductivity `b` in dimension `i` from the fine grid +to coarse cell `I`, averaging the two fine-grid face values. +""" @fastmath @inline function restrictL(I::CartesianIndex,i,b) s = zero(eltype(b)) for J ∈ up(I,i) @@ -15,31 +37,43 @@ end return 0.5s end +""" + restrictML(b::Poisson) + +Build a new coarse-level `Poisson` from fine-level `b` by restricting the +conductivity `L` and allocating matching solution/residual arrays. +""" function restrictML(b::Poisson) N,n = size_u(b.L) - Na = map(i->1+i÷2,N) + Na = map(i->i÷2+2,N) aL = similar(b.L,(Na...,n)); fill!(aL,0) ax = similar(b.x,Na); fill!(ax,0) restrictL!(aL,b.L,perdir=b.perdir) Poisson(ax,aL,copy(ax);b.perdir) end +""" + restrictL!(a, b; perdir=()) + +Restrict the fine-grid conductivity `b` into coarse-grid `a`, then apply +boundary conditions (`BC!` and `wallBC_L!`) on the coarse level. +""" function restrictL!(a::AbstractArray{T,M},b;perdir=()) where {T,M} Na,n = size_u(a) for i ∈ 1:n - @loop a[I,i] = restrictL(I,i,b) over I ∈ CartesianIndices(map(n->2:n-1,Na)) + @loop a[I,i] = restrictL(I,i,b) over I ∈ CartesianIndices(map(n->3:n-2,Na)) end BC!(a,zero(SVector{M-1,T}),false,perdir) # correct μ₀ @ boundaries + wallBC_L!(a, perdir) end restrict!(a,b) = @inside a[I] = restrict(I,b) prolongate!(a,b) = @inside a[I] = b[down(I)] -@inline divisible(N) = mod(N,2)==0 && N>4 @inline divisible(l::Poisson) = all(size(l.x) .|> divisible) """ - MultiLevelPoisson{N,M} + MultiLevelPoisson{T,S,V} Composite type used to solve the pressure Poisson equation with a [geometric multigrid](https://en.wikipedia.org/wiki/Multigrid_method) method. -The only variable is `levels`, a vector of nested `Poisson` systems. +The main field is `levels`, a vector of nested `Poisson` systems from fine to coarse. """ struct MultiLevelPoisson{T,S<:AbstractArray{T},V<:AbstractArray{T}} <: AbstractPoisson{T,S,V} x::S @@ -67,6 +101,13 @@ function update!(ml::MultiLevelPoisson) end end +""" + Vcycle!(ml::MultiLevelPoisson; l=1, ω=1) + +Perform one multigrid V-cycle starting at level `l`: smooth on the fine grid, +restrict the residual, solve (or recurse) on the coarse grid, then prolongate +and correct the fine-grid solution. +""" function Vcycle!(ml::MultiLevelPoisson;l=1,ω=1) fine,coarse = ml.levels[l],ml.levels[l+1] # set up coarse level @@ -86,6 +127,13 @@ residual!(ml::MultiLevelPoisson,x) = residual!(ml.levels[1],x) smooth! = GaussSeidelRB! +""" + solver!(ml::MultiLevelPoisson; tol=1e-4, itmx=32) + +Multigrid solver: iterates V-cycles with adaptive relaxation `ω` until the +`L₂`-norm residual drops below `tol`. Ends with `pin_pressure!` + `comm!` +to remove the null-space mode and synchronize halos. +""" function solver!(ml::MultiLevelPoisson{T};tol=1e-4,itmx=32) where T p = ml.levels[1] residual!(p); r₂ = L₂(p); ω = T(1) @@ -103,6 +151,6 @@ function solver!(ml::MultiLevelPoisson{T};tol=1e-4,itmx=32) where T r₂ = rnew r₂1e2) && return # alpha should be O(1) @loop (x[I] += alpha*ϵ[I]; r[I] -= alpha*z[I]) over I ∈ inside(x) i==it && return @inside z[I] = r[I]*p.iD[I] - rho2 = r⋅z + rho2 = global_dot(r,z) abs(rho2)<10eps(T) && return beta = rho2/rho @inside ϵ[I] = beta*ϵ[I]+z[I] @@ -177,20 +184,23 @@ function pcg!(p::Poisson{T};it=6,kwargs...) where T end end -L₂(p::Poisson) = p.r ⋅ p.r # special method since outside(p.r)≡0 +L₂(p::Poisson) = global_dot(p.r, p.r) # special method since outside(p.r)≡0 L∞(p::Poisson) = maximum(abs,p.r) """ - solver!(A::Poisson;tol=1e-4,itmx=1e3) + solver!(A::Poisson; tol=1e-4, itmx=1e3) -Approximate iterative solver for the Poisson matrix equation `Ax=b`. +Iterative solver for the Poisson matrix equation `Ax=b` using +preconditioned conjugate gradients (`pcg!`). - - `A`: Poisson matrix with working arrays. - - `A.x`: Solution vector. Can start with an initial guess. - - `A.z`: Right-Hand-Side vector. Will be overwritten! - - `A.n[end]`: stores the number of iterations performed. + - `A.x`: Solution vector (can start with an initial guess). + - `A.z`: Right-hand-side vector (overwritten). + - `A.n[end]`: Number of iterations performed. - `tol`: Convergence tolerance on the `L₂`-norm residual. - `itmx`: Maximum number of iterations. + +Ends with `pin_pressure!` (remove null-space mean) and `comm!` +(halo sync) so the solution is ready for use in `project!`. """ function solver!(p::Poisson;tol=1e-4,itmx=1e3) residual!(p); r₂ = L₂(p) @@ -200,6 +210,6 @@ function solver!(p::Poisson;tol=1e-4,itmx=1e3) @log ", $nᵖ, $(L∞(p)), $r₂\n" r₂f(i,x.+o,t); end) + g isa Function && (g = let o=offset, f=g; (i,x,t)->f(i,x.+o,t); end) + end flow = Flow(dims,uBC;uλ,Δt,ν,g,T,f=mem,perdir,exitBC) measure!(flow,body;ϵ) - new(U,L,ϵ,flow,body,MultiLevelPoisson(flow.p,flow.μ₀,flow.σ;perdir)) + Lp = copy(flow.μ₀); wallBC_L!(Lp, perdir) + new(U,L,ϵ,flow,body,MultiLevelPoisson(flow.p,Lp,flow.σ;perdir)) end end @@ -123,6 +156,8 @@ Measure a dynamic `body` to update the `flow` and `pois` coefficients. """ function measure!(sim::AbstractSimulation,t=sum(sim.flow.Δt)) measure!(sim.flow,sim.body;t,ϵ=sim.ϵ) + sim.pois.levels[1].L .= sim.flow.μ₀ + wallBC_L!(sim.pois.levels[1].L, sim.pois.perdir) update!(sim.pois) end @@ -178,16 +213,32 @@ export viz!, get_body, plot_body_obs! Check the number of threads available for the Julia session that loads WaterLily. A warning is shown when running in serial (JULIA_NUM_THREADS=1) with KernelAbstractions enabled. +Skipped during precompilation (which always runs single-threaded). """ function check_nthreads() + ccall(:jl_generating_output, Cint, ()) != 0 && return # skip during precompilation if backend == "KernelAbstractions" && Threads.nthreads() == 1 @warn """ Using WaterLily in serial (ie. JULIA_NUM_THREADS=1) is not recommended because it defaults to serial CPU execution. Use JULIA_NUM_THREADS=auto, or any number of threads greater than 1, to allow multi-threading in CPU backends. - For a low-overhead single-threaded CPU only backend set: WaterLily.set_backend("SIMD") - """ + For a low-overhead single-threaded CPU only backend set: WaterLily.set_backend("SIMD")""" + end +end +function __init__() + check_nthreads() +end + +""" + check_mem(mem) + +Check that the memory type `mem` is compatible with the current backend. +GPU array types (anything other than `Array`) require the KernelAbstractions backend. +""" +function check_mem(mem) + if backend == "SIMD" && mem !== Array + error("GPU memory (mem=$mem) requires the KernelAbstractions backend. " * + "Set it with: WaterLily.set_backend(\"KernelAbstractions\") then restart the Julia session.") end end -check_nthreads() end # module diff --git a/src/util.jl b/src/util.jl index 9a3aa7b6..0299da00 100644 --- a/src/util.jl +++ b/src/util.jl @@ -1,5 +1,6 @@ using KernelAbstractions: get_backend, @index, @kernel using LoggingExtras +using LinearAlgebra: ⋅ # custom log macro _psolver = Logging.LogLevel(-123) # custom log level for pressure solver, needs the negative sign @@ -31,7 +32,6 @@ Replace jᵗʰ component of CartesianIndex with k CIj(j,I::CartesianIndex{d},k) where d = CI(ntuple(i -> i==j ? k : I[i], d)) """ - δ(i,N::Int) δ(i,I::CartesianIndex{N}) where {N} Return a CartesianIndex of dimension `N` which is one at index `i` and zero elsewhere. @@ -40,11 +40,12 @@ Return a CartesianIndex of dimension `N` which is one at index `i` and zero else δ(i,I::CartesianIndex{N}) where N = δ(i, Val{N}()) """ - inside(a;buff=1) + inside(a;buff=2) -Return CartesianIndices range excluding a single layer of cells on all boundaries. +Return CartesianIndices range excluding `buff` layers of cells on all boundaries. +Default `buff=2` matches the N+4 staggered grid layout (2 ghost/boundary cells per side). """ -@inline inside(a::AbstractArray;buff=1) = CartesianIndices(map(ax->first(ax)+buff:last(ax)-buff,axes(a))) +@inline inside(a::AbstractArray;buff=2) = CartesianIndices(map(ax->first(ax)+buff:last(ax)-buff,axes(a))) """ inside_u(dims,j) @@ -55,17 +56,13 @@ a _vector_ array on face `j` with size `dims`. function inside_u(dims::NTuple{N},j) where {N} CartesianIndices(ntuple( i-> i==j ? (3:dims[i]-1) : (2:dims[i]), N)) end -@inline inside_u(dims::NTuple{N}) where N = CartesianIndices((map(i->(2:i-1),dims)...,1:N)) -@inline inside_u(u::AbstractArray) = CartesianIndices(map(i->(2:i-1),size(u)[1:end-1])) +@inline inside_u(dims::NTuple{N}) where N = CartesianIndices((map(i->(3:i-2),dims)...,1:N)) +@inline inside_u(u::AbstractArray) = CartesianIndices(map(i->(3:i-2),size(u)[1:end-1])) splitn(n) = Base.front(n),last(n) size_u(u) = splitn(size(u)) -""" - L₂(a) - -L₂ norm of array `a` excluding ghosts. -""" -L₂(a) = sum(abs2,@inbounds(a[I]) for I ∈ inside(a)) +local_dot(a, b) = a⋅b +local_sum(a) = sum(a) """ @inside @@ -92,6 +89,13 @@ end # Could also use ScopedValues in Julia 1.11+ using Preferences const backend = @load_preference("backend", "KernelAbstractions") +""" + set_backend(new_backend::String) + +Set the loop execution backend to `"SIMD"` (single-threaded) or +`"KernelAbstractions"` (multi-threaded CPU / GPU). The preference is +persisted via Preferences.jl; a Julia restart is required. +""" function set_backend(new_backend::String) if !(new_backend in ("SIMD", "KernelAbstractions")) throw(ArgumentError("Invalid backend: \"$(new_backend)\"")) @@ -179,7 +183,7 @@ using StaticArrays loc(i,I) = loc(Ii) Location in space of the cell at CartesianIndex `I` at face `i`. -Using `i=0` returns the cell center s.t. `loc = I`. +Using `i=0` returns the cell center s.t. `loc(0,I) = I .- 1.5`. """ @inline loc(i,I::CartesianIndex{N},T=Float32) where N = SVector{N,T}(I.I .- 1.5 .- 0.5 .* δ(i,I).I) @inline loc(Ii::CartesianIndex,T=Float32) = loc(last(Ii),Base.front(Ii),T) @@ -192,57 +196,182 @@ Apply a vector function `f(i,x)` to the faces of a uniform staggered array `c` o a function `f(x)` to the center of a uniform array `c`. """ apply!(f,c) = hasmethod(f,Tuple{Int,CartesianIndex}) ? applyV!(f,c) : applyS!(f,c) -applyV!(f,c) = @loop c[Ii] = f(last(Ii),loc(Ii,eltype(c))) over Ii ∈ CartesianIndices(c) -applyS!(f,c) = @loop c[I] = f(loc(0,I,eltype(c))) over I ∈ CartesianIndices(c) +applyV!(f,c,offset=global_offset(Val(ndims(c)-1),eltype(c))) = @loop c[Ii] = f(last(Ii),loc(Ii,eltype(c))+offset) over Ii ∈ CartesianIndices(c) +applyS!(f,c,offset=global_offset(Val(ndims(c)),eltype(c))) = @loop c[I] = f(loc(0,I,eltype(c))+offset) over I ∈ CartesianIndices(c) """ slice(dims,i,j,low=1) Return `CartesianIndices` range slicing through an array of size `dims` in -dimension `j` at index `i`. `low` optionally sets the lower extent of the range -in the other dimensions. +dimension `j` at index `i` (or range `i`). `low` optionally sets the lower +extent of the range in the other dimensions. """ function slice(dims::NTuple{N},i,j,low=1) where N CartesianIndices(ntuple( k-> k==j ? (i:i) : (low:dims[k]), N)) end +function slice(dims::NTuple{N},i::AbstractUnitRange,j,low=1) where N + CartesianIndices(ntuple( k-> k==j ? i : (low:dims[k]), N)) +end + +# ── AbstractParMode dispatch pattern ────────────────────────────────────────── +# +# Serial WaterLily dispatches all hooks through `par_mode[]` (defaults to Serial()). +# The MPI extension adds `Parallel <: AbstractParMode` with MPI-aware methods — +# no method overwriting, so precompilation works normally. +abstract type AbstractParMode end +struct Serial <: AbstractParMode end +const par_mode = Ref{AbstractParMode}(Serial()) + +""" + global_dot(a, b) + +Global dot product `a⋅b`. In serial, equivalent to `a⋅b`. +The MPI extension replaces this with `MPI.Allreduce(…, SUM)`. +""" +global_dot(a, b) = global_allreduce(local_dot(a, b)) +""" + global_sum(a) + +Global sum of array `a`. MPI-aware via dispatch on `par_mode[]`. +""" +global_sum(a) = global_allreduce(local_sum(a)) +""" + global_length(r) +Global length of index range `r`. MPI-aware via dispatch on `par_mode[]`. """ - BC!(a,A) +global_length(r) = global_allreduce(length(r)) +""" + global_min(a, b) + +Global minimum of `a` and `b`. MPI-aware via dispatch on `par_mode[]`. +""" +global_min(a, b) = _global_min(a, b, par_mode[]) + +_global_min(a, b, ::Serial) = min(a, b) + +""" + global_allreduce(x) + +Reduce a pre-computed value `x` (scalar or vector) across all MPI ranks +with element-wise summation. In serial, returns `x` unchanged. +This is the primitive that other global reductions build on: +`global_sum(a) = global_allreduce(local_sum(a))`. +""" +global_allreduce(x) = _global_allreduce(x, par_mode[]) +_global_allreduce(x, ::Serial) = x + +""" + L₂(a) -Apply boundary conditions to the ghost cells of a _vector_ field. A Dirichlet -condition `a[I,i]=A[i]` is applied to the vector component _normal_ to the domain -boundary. For example `aₓ(x)=Aₓ ∀ x ∈ minmax(X)`. A zero Neumann condition -is applied to the tangential components. +L₂ norm of array `a` over interior cells (excluding ghost cells). +""" +L₂(a) = (R = inside(a); @view(a[R])⋅@view(a[R])) + +""" + global_perdot(a, b, perdir) + +Dot product of `a` and `b` respecting periodic boundary conditions. +When `perdir` is empty, uses the full arrays; otherwise restricts to interior cells. +MPI-aware via dispatch on `par_mode[]`. +""" +local_perdot(a,b,::Tuple{}) = a⋅b +local_perdot(a,b,perdir,R=inside(a)) = @view(a[R])⋅@view(b[R]) +global_perdot(a,b,tup::Tuple{}) = global_allreduce(local_perdot(a, b, tup)) +global_perdot(a,b,perdir,R=inside(a)) = global_allreduce(local_perdot(a, b, perdir, R)) + +""" + scalar_halo!(x) + +Exchange halo cells for scalar array `x`. No-op in serial. +The MPI extension routes fine-grid arrays through IGG `update_halo!` +and coarse multigrid arrays through direct `MPI.Isend`/`MPI.Irecv!`. +""" +scalar_halo!(x) = _scalar_halo!(x, par_mode[]) +""" + velocity_halo!(u) + +Exchange halo cells for a velocity (vector) array `u`. No-op in serial. +The MPI extension exchanges each component separately via `scalar_halo!`. +""" +velocity_halo!(u) = _velocity_halo!(u, par_mode[]) +_scalar_halo!(x, ::Serial) = nothing +_velocity_halo!(u, ::Serial) = nothing + +""" + pin_pressure!(x) + +Remove the mean of scalar field `x` over the interior cells. +This pins the null-space mode of the all-Neumann Poisson operator +so that the absolute pressure does not drift between time steps. +""" +function pin_pressure!(x) + s = global_sum(x)/global_length(inside(x)) + @inside x[I] = x[I] - s +end + +""" + wallBC_L!(L, perdir=()) + +Zero the Poisson conductivity `L` at physical (non-periodic) boundary faces. +This decouples the boundary cell from the ghost cell, giving an implicit +Neumann pressure BC (∂p/∂n = 0) at domain walls. +""" +wallBC_L!(L, perdir=()) = _wallBC_L!(L, perdir, par_mode[]) +function _wallBC_L!(L, perdir, ::Serial) + N, n = size_u(L) + for j in 1:n + j in perdir && continue + @loop L[I,j] = zero(eltype(L)) over I ∈ slice(N, 3, j) # left wall + @loop L[I,j] = zero(eltype(L)) over I ∈ slice(N, N[j]-1, j) # right wall + end +end + +""" + divisible(N) + +Check if array dimension `N` is divisible for multigrid coarsening. +""" +divisible(N) = _divisible(N, par_mode[]) +_divisible(N, ::Serial) = mod(N,2)==0 && N>4 + +""" + BC!(a, uBC, saveexit=false, perdir=(), t=0) + +Apply domain boundary conditions to the ghost cells of a _vector_ field. +A Dirichlet condition is applied to the _normal_ component; zero Neumann to tangential. +Periodic directions are handled by `velocity_comm!` (called at the end), +separating domain BCs from communication BCs. """ BC!(a,U,saveexit=false,perdir=(),t=0) = BC!(a,(i,x,t)->U[i],saveexit,perdir,t) function BC!(a,uBC::Function,saveexit=false,perdir=(),t=0) N,n = size_u(a) for i ∈ 1:n, j ∈ 1:n - if j in perdir - @loop a[I,i] = a[CIj(j,I,N[j]-1),i] over I ∈ slice(N,1,j) - @loop a[I,i] = a[CIj(j,I,2),i] over I ∈ slice(N,N[j],j) - else - if i==j # Normal direction, Dirichlet - for s ∈ (1,2) - @loop a[I,i] = uBC(i,loc(i,I),t) over I ∈ slice(N,s,j) - end - (!saveexit || i>1) && (@loop a[I,i] = uBC(i,loc(i,I),t) over I ∈ slice(N,N[j],j)) # overwrite exit - else # Tangential directions, Neumann - @loop a[I,i] = uBC(i,loc(i,I),t)+a[I+δ(j,I),i]-uBC(i,loc(i,I+δ(j,I)),t) over I ∈ slice(N,1,j) - @loop a[I,i] = uBC(i,loc(i,I),t)+a[I-δ(j,I),i]-uBC(i,loc(i,I-δ(j,I)),t) over I ∈ slice(N,N[j],j) + j in perdir && continue # periodic handled by velocity_comm! + if i==j # Normal direction, Dirichlet + @loop a[I,i] = uBC(i,loc(i,I),t) over I ∈ slice(N,1:2,j) + if !saveexit || i>1 + @loop a[I,i] = uBC(i,loc(i,I),t) over I ∈ slice(N,N[j]-1:N[j],j) + else + @loop a[I,i] = uBC(i,loc(i,I),t) over I ∈ slice(N,N[j],j) end + else # Tangential directions, Neumann: mirror about wall face (between 2,3 and M-2,M-1) + @loop a[I,i] = uBC(i,loc(i,I),t)+a[CIj(j,I,5-I[j]),i]-uBC(i,loc(i,CIj(j,I,5-I[j])),t) over I ∈ slice(N,1:2,j) + @loop a[I,i] = uBC(i,loc(i,I),t)+a[CIj(j,I,2N[j]-3-I[j]),i]-uBC(i,loc(i,CIj(j,I,2N[j]-3-I[j])),t) over I ∈ slice(N,N[j]-1:N[j],j) end end + velocity_comm!(a, perdir) end """ - exitBC!(u,u⁰,U,Δt) + exitBC!(u,u⁰,Δt) Apply a 1D convection scheme to fill the ghost cell on the exit of the domain. """ -function exitBC!(u,u⁰,Δt) +exitBC!(u,u⁰,Δt) = _exitBC!(u,u⁰,Δt,par_mode[]) +function _exitBC!(u,u⁰,Δt,::Serial) N,_ = size_u(u) - exitR = slice(N.-1,N[1],1,2) # exit slice excluding ghosts - U = sum(@view(u[slice(N.-1,2,1,2),1]))/length(exitR) # inflow mass flux + exitR = slice(N.-2,N[1]-1,1,3) # exit slice excluding ghosts + U = sum(@view(u[slice(N.-2,2,1,3),1]))/length(exitR) # inflow mass flux (at Dirichlet face) @loop u[I,1] = u⁰[I,1]-U*Δt*(u⁰[I,1]-u⁰[I-δ(1,I),1]) over I ∈ exitR ∮u = sum(@view(u[exitR,1]))/length(exitR)-U # mass flux imbalance @loop u[I,1] -= ∮u over I ∈ exitR # correct flux @@ -254,27 +383,44 @@ Apply periodic conditions to the ghost cells of a _scalar_ field. """ perBC!(a,::Tuple{}) = nothing perBC!(a, perdir, N = size(a)) = for j ∈ perdir - @loop a[I] = a[CIj(j,I,N[j]-1)] over I ∈ slice(N,1,j) - @loop a[I] = a[CIj(j,I,2)] over I ∈ slice(N,N[j],j) + @loop a[I] = a[CIj(j,I,I[j]+N[j]-4)] over I ∈ slice(N,1:2,j) + @loop a[I] = a[CIj(j,I,I[j]-N[j]+4)] over I ∈ slice(N,N[j]-1:N[j],j) end -using LinearAlgebra: ⋅ + """ - perdot(a,b,perdir) + comm!(a, perdir) -Apply dot product to the inner cells of two _scalar_ fields, assuming zero values in ghost cell when using Neumann BC. +Scalar communication: periodic BC copy + MPI halo exchange. +In serial, just applies `perBC!`. In parallel, MPI halo handles everything. """ -perdot(a,b,::Tuple{}) = a⋅b -perdot(a,b,perdir,R=inside(a)) = @view(a[R])⋅@view(b[R]) +comm!(a, perdir) = _comm!(a, perdir, par_mode[]) +_comm!(a, perdir, ::Serial) = perBC!(a, perdir) + """ - interp(x::SVector, arr::AbstractArray) + velocity_comm!(a, perdir) -Linear interpolation from array `arr` at Cartesian-coordinate `x`. +Velocity communication: periodic BC copy + MPI halo exchange. +In serial, copies periodic ghost cells for all velocity components. +In parallel, MPI halo handles everything. +""" +velocity_comm!(a, perdir) = _velocity_comm!(a, perdir, par_mode[]) +function _velocity_comm!(a, perdir, ::Serial) + _, n = size_u(a) + for i ∈ 1:n + perBC!(@view(a[..,i]), perdir) + end +end -Note: This routine works for any number of dimensions. """ -function interp(x::SVector{D,T}, arr::AbstractArray{T,D}) where {D,T} + interp(x::SVector, arr::AbstractArray, offset=zero(x)) + +Linear interpolation from array `arr` at Cartesian-coordinate `x`. +`offset` shifts `x` before indexing — used in MPI parallel to map +global coordinates to rank-local array indices (pass `flow.offset`). +""" +function interp(x::SVector{D,T}, arr::AbstractArray{T,D}, offset=zero(x)) where {D,T} # Index below the interpolation coordinate and the difference - x = x .+ 1.5f0; i = floor.(Int,x); y = x.-i + x = x .- offset .+ 1.5f0; i = floor.(Int,x); y = x.-i # CartesianIndices around x I = CartesianIndex(i...); R = I:I+oneunit(I) @@ -288,10 +434,10 @@ function interp(x::SVector{D,T}, arr::AbstractArray{T,D}) where {D,T} return s end using EllipsisNotation -function interp(x::SVector{D,T}, varr::AbstractArray{T}) where {D,T} +function interp(x::SVector{D,T}, varr::AbstractArray{T}, offset=zero(x)) where {D,T} # Shift to align with each staggered grid component and interpolate @inline shift(i) = SVector{D,T}(ifelse(i==j,0.5,0.) for j in 1:D) - return SVector{D,T}(interp(x+shift(i),@view(varr[..,i])) for i in 1:D) + return SVector{D,T}(interp(x+shift(i),@view(varr[..,i]),offset) for i in 1:D) end """ @@ -312,7 +458,7 @@ For example, the standard Smagorinsky–Lilly model for the sub-grid scale stres It can be implemented as `smagorinsky(I::CartesianIndex{m} where m; S, Cs, Δ) = @views (Cs*Δ)^2*sqrt(dot(S[I,:,:],S[I,:,:]))` -and passed into `sim_step!` as a keyword argument together with the varibles than the function needs (`S`, `Cs`, and `Δ`): +and passed into `sim_step!` as a keyword argument together with the variables that the function needs (`S`, `Cs`, and `Δ`): `sim_step!(sim, ...; udf=sgs, νₜ=smagorinsky, S, Cs, Δ)` """ function sgs!(flow, t; νₜ, S, Cs, Δ) @@ -339,3 +485,4 @@ ic_function(uBC::Function) = (i,x)->uBC(i,x,0) ic_function(uBC::Tuple) = (i,x)->uBC[i] squeeze(a::AbstractArray) = dropdims(a, dims = tuple(findall(size(a) .== 1)...)) + diff --git a/test/maintests.jl b/test/maintests.jl index 7f185e4c..ee358949 100644 --- a/test/maintests.jl +++ b/test/maintests.jl @@ -2,7 +2,9 @@ using GPUArrays using ReadVTK, WriteVTK, JLD2 backend != "KernelAbstractions" && throw(ArgumentError("SIMD backend not allowed to run main tests, use KernelAbstractions backend")) -@info "Test backends: $(join(arrays,", "))" +const test_filter = get(ENV, "WATERLILY_TEST", "") +skip(name) = !isempty(test_filter) && !occursin(test_filter, name) +@info "Test backends: $(join(arrays,", "))" * (isempty(test_filter) ? "" : " (filter: $test_filter)") @testset "util.jl" begin I = CartesianIndex(1,2,3,4) @test I+δ(3,I) == CartesianIndex(1,2,4,4) @@ -22,91 +24,119 @@ backend != "KernelAbstractions" && throw(ArgumentError("SIMD backend not allowed @test WaterLily.joinsymtype(sym,[:A,:B,:C]) == Expr[:(a::A), :(b::B), :(c::C)] for f ∈ arrays - p = zeros(4,5) |> f + p = zeros(6,7) |> f apply!(x->x[1]+x[2]+3,p) # add 2×1.5 to move edge to origin - @test inside(p) == CartesianIndices((2:3,2:4)) + @test inside(p) == CartesianIndices((3:4,3:5)) @test inside(p,buff=0) == CartesianIndices(p) - @test L₂(p) == 187 + @test L₂(p) == 343 - u = zeros(5,5,2) |> f + u = zeros(7,7,2) |> f apply!((i,x)->x[i],u) @test GPUArrays.@allowscalar [u[i,j,1].-(i-2) for i in 1:3, j in 1:3]==zeros(3,3) - Ng, D, U = (6, 6), 2, (1.0, 0.5) + Ng, D, U = (8, 8), 2, (1.0, 0.5) u = rand(Ng..., D) |> f # vector σ = rand(Ng...) |> f # scalar BC!(u, U) - @test GPUArrays.@allowscalar all(u[1, :, 1] .== U[1]) && all(u[2, :, 1] .== U[1]) && all(u[end, :, 1] .== U[1]) && - all(u[3:end-1, 1, 1] .== u[3:end-1, 2, 1]) && all(u[3:end-1, end, 1] .== u[3:end-1, end-1, 1]) - @test GPUArrays.@allowscalar all(u[:, 1, 2] .== U[2]) && all(u[:, 2, 2] .== U[2]) && all(u[:, end, 2] .== U[2]) && - all(u[1, 3:end-1, 2] .== u[2, 3:end-1, 2]) && all(u[end, 3:end-1, 2] .== u[end-1, 3:end-1, 2]) - - GPUArrays.@allowscalar u[end,:,1] .= 3 - BC!(u,U,true) # save exit values - @test GPUArrays.@allowscalar all(u[end, :, 1] .== 3) + # Normal Dirichlet: indices 1,2 (left) and M-1,M (right) + @test GPUArrays.@allowscalar all(u[1, :, 1] .== U[1]) && all(u[2, :, 1] .== U[1]) && + all(u[end-1, :, 1] .== U[1]) && all(u[end, :, 1] .== U[1]) + # Tangential Neumann: ghost cells mirror about wall face (2nd-order) + @test GPUArrays.@allowscalar all(u[3:end-2, 2, 1] .== u[3:end-2, 3, 1]) && + all(u[3:end-2, 1, 1] .== u[3:end-2, 4, 1]) && + all(u[3:end-2, end-1, 1] .== u[3:end-2, end-2, 1]) && + all(u[3:end-2, end, 1] .== u[3:end-2, end-3, 1]) + @test GPUArrays.@allowscalar all(u[:, 1, 2] .== U[2]) && all(u[:, 2, 2] .== U[2]) && + all(u[:, end-1, 2] .== U[2]) && all(u[:, end, 2] .== U[2]) + @test GPUArrays.@allowscalar all(u[2, 3:end-2, 2] .== u[3, 3:end-2, 2]) && + all(u[1, 3:end-2, 2] .== u[4, 3:end-2, 2]) && + all(u[end-1, 3:end-2, 2] .== u[end-2, 3:end-2, 2]) && + all(u[end, 3:end-2, 2] .== u[end-3, 3:end-2, 2]) + + GPUArrays.@allowscalar u[end-1,:,1] .= 3 + BC!(u,U,true) # save exit values (only end-1 is saved, end gets Dirichlet) + @test GPUArrays.@allowscalar all(u[end-1, :, 1] .== 3) WaterLily.exitBC!(u,u,0) # conservative exit check - @test GPUArrays.@allowscalar all(u[end,2:end-1, 1] .== U[1]) + @test GPUArrays.@allowscalar all(u[end-1,3:end-2, 1] .≈ U[1]) # test BC with function Ubc(i,x,t) = i==1 ? 1.0 : 0.5 v = rand(Ng..., D) |> f # vector BC!(v,Ubc,false); BC!(u,U,false) # make sure we apply the same - @test GPUArrays.@allowscalar all(v[1, :, 1] .== u[1, :, 1]) && all(v[2, :, 1] .== u[2, :, 1]) && all(v[end, :, 1] .== u[end, :, 1]) - @test GPUArrays.@allowscalar all(v[:, 1, 2] .== u[:, 1, 2]) && all(v[:, 2, 2] .== u[:, 2, 2]) && all(v[:, end, 2] .== u[:, end, 2]) + @test GPUArrays.@allowscalar all(v[1, :, 1] .== u[1, :, 1]) && all(v[2, :, 1] .== u[2, :, 1]) && + all(v[end-1, :, 1] .== u[end-1, :, 1]) && all(v[end, :, 1] .== u[end, :, 1]) + @test GPUArrays.@allowscalar all(v[:, 1, 2] .== u[:, 1, 2]) && all(v[:, 2, 2] .== u[:, 2, 2]) && + all(v[:, end-1, 2] .== u[:, end-1, 2]) && all(v[:, end, 2] .== u[:, end, 2]) # test exit bc - GPUArrays.@allowscalar v[end,:,1] .= 3 - BC!(v,Ubc,true) # save exit values - @test GPUArrays.@allowscalar all(v[end, :, 1] .== 3) + GPUArrays.@allowscalar v[end-1,:,1] .= 3 + BC!(v,Ubc,true) # save exit values (only end-1 is saved) + @test GPUArrays.@allowscalar all(v[end-1, :, 1] .== 3) BC!(u,U,true,(2,)) # periodic in y and save exit values - @test GPUArrays.@allowscalar all(u[:, 1:2, 1] .== u[:, end-1:end, 1]) && all(u[:, 1:2, 1] .== u[:,end-1:end,1]) + # periodic ghost cells: left ghosts copy from right interior and vice versa + @test GPUArrays.@allowscalar all(u[:, 1, 1] .== u[:, end-3, 1]) && all(u[:, 2, 1] .== u[:, end-2, 1]) && + all(u[:, end-1, 1] .== u[:, 3, 1]) && all(u[:, end, 1] .== u[:, 4, 1]) WaterLily.perBC!(σ,(1,2)) # periodic in two directions - @test GPUArrays.@allowscalar all(σ[1, 2:end-1] .== σ[end-1, 2:end-1]) && all(σ[2:end-1, 1] .== σ[2:end-1, end-1]) + @test GPUArrays.@allowscalar all(σ[1, :] .== σ[end-3, :]) && all(σ[2, :] .== σ[end-2, :]) && + all(σ[end-1, :] .== σ[3, :]) && all(σ[end, :] .== σ[4, :]) && + all(σ[:, 1] .== σ[:, end-3]) && all(σ[:, 2] .== σ[:, end-2]) && + all(σ[:, end-1] .== σ[:, 3]) && all(σ[:, end] .== σ[:, 4]) u = rand(Ng..., D) |> f # vector BC!(u,U,true,(1,)) #saveexit has no effect here as x-periodic - @test GPUArrays.@allowscalar all(u[1:2, :, 1] .== u[end-1:end, :, 1]) && all(u[1:2, :, 2] .== u[end-1:end, :, 2]) && - all(u[:, 1, 2] .== U[2]) && all(u[:, 2, 2] .== U[2]) && all(u[:, end, 2] .== U[2]) + # x-periodic: ghost cells copy from interior + @test GPUArrays.@allowscalar all(u[1, :, 1] .== u[end-3, :, 1]) && all(u[2, :, 1] .== u[end-2, :, 1]) && + all(u[end-1, :, 1] .== u[3, :, 1]) && all(u[end, :, 1] .== u[4, :, 1]) && + all(u[1, :, 2] .== u[end-3, :, 2]) && all(u[2, :, 2] .== u[end-2, :, 2]) && + all(u[end-1, :, 2] .== u[3, :, 2]) && all(u[end, :, 2] .== u[4, :, 2]) && + all(u[:, 1, 2] .== U[2]) && all(u[:, 2, 2] .== U[2]) && + all(u[:, end-1, 2] .== U[2]) && all(u[:, end, 2] .== U[2]) # test non-uniform BCs Ubc_1(i,x,t) = i==1 ? x[2] : x[1] v .= 0; BC!(v,Ubc_1) # the tangential BC change the value of the ghost cells on the other axis, so we cannot check it - @test GPUArrays.@allowscalar all(v[1,2:end-1,1] .≈ v[end,2:end-1,1]) - @test GPUArrays.@allowscalar all(v[2:end-1,1,2] .≈ v[2:end-1,end,2]) + @test GPUArrays.@allowscalar all(v[1,3:end-2,1] .≈ v[end,3:end-2,1]) + @test GPUArrays.@allowscalar all(v[3:end-2,1,2] .≈ v[3:end-2,end,2]) # more complex - Ng, D = (8, 8, 8), 3 + Ng, D = (10, 10, 10), 3 u = zeros(Ng..., D) |> f # vector - Ubc_2(i,x,t) = i==1 ? cos(2π*x[1]/8) : i==2 ? sin(2π*x[2]/8) : tan(π*x[3]/16) + Ubc_2(i,x,t) = i==1 ? cos(2π*x[1]/10) : i==2 ? sin(2π*x[2]/10) : tan(π*x[3]/20) BC!(u,Ubc_2) - @test GPUArrays.@allowscalar all(u[1,:,:,1] .≈ cos(-1π/4)) && all(u[2,:,:,1] .≈ cos(0)) && all(u[end,:,:,1] .≈ cos(6π/4)) - @test GPUArrays.@allowscalar all(u[:,1,:,2] .≈ sin(-1π/4)) && all(u[:,2,:,2] .≈ sin(0)) && all(u[:,end,:,2] .≈ sin(6π/4)) - @test GPUArrays.@allowscalar all(u[:,:,1,3] .≈ tan(-1π/16)) && all(u[:,:,2,3] .≈ tan(0)) && all(u[:,:,end,3].-tan(6π/16).<1e-6) + # With N+4: index 1 → loc(i,1) = 1-2 = -1, index 2 → 0, index end-1 → M-2.5, index end → M-1.5 + # For M=10: normal x at indices 1,2,end-1,end → x = -1, 0, 7, 8 (face coords) + @test GPUArrays.@allowscalar all(u[1,:,:,1] .≈ cos(2π*(-1)/10)) && all(u[2,:,:,1] .≈ cos(0)) && + all(u[end-1,:,:,1] .≈ cos(2π*7/10)) && all(u[end,:,:,1] .≈ cos(2π*8/10)) + @test GPUArrays.@allowscalar all(u[:,1,:,2] .≈ sin(2π*(-1)/10)) && all(u[:,2,:,2] .≈ sin(0)) && + all(u[:,end-1,:,2] .≈ sin(2π*7/10)) && all(u[:,end,:,2] .≈ sin(2π*8/10)) + @test GPUArrays.@allowscalar all(u[:,:,1,3] .≈ tan(π*(-1)/20)) && all(u[:,:,2,3] .≈ tan(0)) && + all(u[:,:,end-1,3].-tan(π*7/20).<1e-6) && all(u[:,:,end,3].-tan(π*8/20).<1e-6) # test interpolation, test on two different array type - a = zeros(Float32,8,8,2) |> f; b = zeros(Float64,8,8) |> f + a = zeros(Float32,10,10,2) |> f; b = zeros(Float64,10,10) |> f apply!((i,x)->x[i],a); apply!(x->x[1],b) # offset for start of grid - @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(2.5f0,1.f0),a) .≈ [2.5f0,1.0f0]) - @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(3.5f0,3.f0),a) .≈ [3.5f0,3.0f0]) - @test GPUArrays.@allowscalar eltype(WaterLily.interp(SVector(2.5f0,1.f0),a))==Float32 - @test_throws MethodError GPUArrays.@allowscalar WaterLily.interp(SVector(2.50,1.0),a) - @test GPUArrays.@allowscalar WaterLily.interp(SVector(2.5,1),b) ≈ 2.5 - @test GPUArrays.@allowscalar WaterLily.interp(SVector(3.5,3),b) ≈ 3.5 - @test GPUArrays.@allowscalar eltype(WaterLily.interp(SVector(3.5,3),b))==Float64 - @test_throws MethodError GPUArrays.@allowscalar WaterLily.interp(SVector(2.5f0,1.f0),b) - - # test on perdot + @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(3.5f0,2.f0),a) .≈ [3.5f0,2.0f0]) + @test GPUArrays.@allowscalar all(WaterLily.interp(SVector(4.5f0,4.f0),a) .≈ [4.5f0,4.0f0]) + @test GPUArrays.@allowscalar eltype(WaterLily.interp(SVector(3.5f0,2.f0),a))==Float32 + @test_throws MethodError GPUArrays.@allowscalar WaterLily.interp(SVector(3.50,2.0),a) + @test GPUArrays.@allowscalar WaterLily.interp(SVector(3.5,2),b) ≈ 3.5 + @test GPUArrays.@allowscalar WaterLily.interp(SVector(4.5,4),b) ≈ 4.5 + @test GPUArrays.@allowscalar eltype(WaterLily.interp(SVector(4.5,4),b))==Float64 + @test_throws MethodError GPUArrays.@allowscalar WaterLily.interp(SVector(3.5f0,2.f0),b) + + # test on perdot (Ng is (10,10,10) from above) σ1 = rand(Ng...) |> f # scalar - σ2 = rand(Ng...) |> f # another scalar + σ2 = rand(Ng...) |> f # another scalar # use ≈ instead of == as summation in different order might result in slight difference in floating point expressions - @test GPUArrays.@allowscalar WaterLily.perdot(σ1,σ2,()) ≈ sum(σ1[I]*σ2[I] for I∈CartesianIndices(σ1)) - @test GPUArrays.@allowscalar WaterLily.perdot(σ1,σ2,(1,)) ≈ sum(σ1[I]*σ2[I] for I∈inside(σ1)) - @test GPUArrays.@allowscalar WaterLily.perdot(σ1,σ2,(1,2)) ≈ sum(σ1[I]*σ2[I] for I∈inside(σ1)) + @test GPUArrays.@allowscalar WaterLily.global_perdot(σ1,σ2,()) ≈ sum(σ1[I]*σ2[I] for I∈CartesianIndices(σ1)) + @test GPUArrays.@allowscalar WaterLily.global_perdot(σ1,σ2,(1,)) ≈ sum(σ1[I]*σ2[I] for I∈inside(σ1)) + @test GPUArrays.@allowscalar WaterLily.global_perdot(σ1,σ2,(1,2)) ≈ sum(σ1[I]*σ2[I] for I∈inside(σ1)) end end function Poisson_setup(poisson,N::NTuple{D};f=Array,T=Float32) where D c = ones(T,N...,D) |> f; BC!(c, ntuple(zero,D)) + WaterLily.wallBC_L!(c) x = zeros(T,N) |> f; z = copy(x) pois = poisson(x,c,z) soln = map(I->T(I.I[1]),CartesianIndices(N)) |> f @@ -120,14 +150,29 @@ end @testset "Poisson.jl" begin for f ∈ arrays - err,pois = Poisson_setup(Poisson,(5,5);f) - @test GPUArrays.@allowscalar parent(pois.D)==f(Float32[0 0 0 0 0; 0 -2 -3 -2 0; 0 -3 -4 -3 0; 0 -2 -3 -2 0; 0 0 0 0 0]) - @test GPUArrays.@allowscalar parent(pois.iD)≈f(Float32[0 0 0 0 0; 0 -1/2 -1/3 -1/2 0; 0 -1/3 -1/4 -1/3 0; 0 -1/2 -1/3 -1/2 0; 0 0 0 0 0]) + # 7×7 array → 3×3 interior (indices 3:5) + err,pois = Poisson_setup(Poisson,(7,7);f) + @test GPUArrays.@allowscalar parent(pois.D)==f(Float32[ + 0 0 0 0 0 0 0; + 0 0 0 0 0 0 0; + 0 0 -2 -3 -2 0 0; + 0 0 -3 -4 -3 0 0; + 0 0 -2 -3 -2 0 0; + 0 0 0 0 0 0 0; + 0 0 0 0 0 0 0]) + @test GPUArrays.@allowscalar parent(pois.iD)≈f(Float32[ + 0 0 0 0 0 0 0; + 0 0 0 0 0 0 0; + 0 0 -1/2 -1/3 -1/2 0 0; + 0 0 -1/3 -1/4 -1/3 0 0; + 0 0 -1/2 -1/3 -1/2 0 0; + 0 0 0 0 0 0 0; + 0 0 0 0 0 0 0]) @test err < 1e-5 - err,pois = Poisson_setup(Poisson,(2^6+2,2^6+2);f) + err,pois = Poisson_setup(Poisson,(2^6+4,2^6+4);f) @test err < 1e-6 @test pois.n[] < 310 - err,pois = Poisson_setup(Poisson,(2^4+2,2^4+2,2^4+2);f) + err,pois = Poisson_setup(Poisson,(2^4+4,2^4+4,2^4+4);f) @test err < 1e-6 @test pois.n[] < 35 end @@ -136,21 +181,21 @@ end @testset "MultiLevelPoisson.jl" begin I = CartesianIndex(4,3,2) @test all(WaterLily.down(J)==I for J ∈ WaterLily.up(I)) - @test_throws AssertionError("MultiLevelPoisson requires size=a2ⁿ, where n>2") Poisson_setup(MultiLevelPoisson,(15+2,3^4+2)) - - err,pois = Poisson_setup(MultiLevelPoisson,(10,10)) - @test pois.levels[3].D == Float32[0 0 0 0; 0 -2 -2 0; 0 -2 -2 0; 0 0 0 0] + @test_throws AssertionError("MultiLevelPoisson requires size=a2ⁿ, where n>2") Poisson_setup(MultiLevelPoisson,(15+4,3^4+4)) + + # (12,12) → level1: 12×12, level2: 7×7, level3: 4×4 (3 levels) + err,pois = Poisson_setup(MultiLevelPoisson,(12,12)) + # Level 3 is 4×4: inside(buff=2) = 3:2 (empty), D should be all zeros + # Actually level 3: Na=1+7÷2=4, which has inside=3:2 (empty) + # Level 2: 7×7 → inside=3:5 (3 cells), not divisible (7 is odd) + # Level 3: Na=1+7÷2=4 → 4×4 @test err < 1e-5 - pois.levels[1].L[5:6,:,1].=0 - WaterLily.update!(pois) - @test pois.levels[3].D == Float32[0 0 0 0; 0 -1 -1 0; 0 -1 -1 0; 0 0 0 0] - for f ∈ arrays - err,pois = Poisson_setup(MultiLevelPoisson,(2^6+2,2^6+2);f) + err,pois = Poisson_setup(MultiLevelPoisson,(2^6+4,2^6+4);f) @test err < 1e-6 @test pois.n[] ≤ 3 - err,pois = Poisson_setup(MultiLevelPoisson,(2^4+2,2^4+2,2^4+2);f) + err,pois = Poisson_setup(MultiLevelPoisson,(2^4+4,2^4+4,2^4+4);f) @test err < 1e-6 @test pois.n[] ≤ 3 end @@ -166,37 +211,15 @@ end cds = WaterLily.cds @test cds(1,0,1) == 0.5 && cds(1,2,-1) == 0.5 # central difference between downstream and itself - # Check QUICK scheme on boundary - ϕuL = WaterLily.ϕuL - ϕuR = WaterLily.ϕuR + # Check QUICK scheme (N+4 layout eliminates boundary-specific ϕuL/ϕuR) quick = WaterLily.quick - ϕ = WaterLily.ϕ - - # inlet with positive flux -> CD - @test ϕuL(1,CartesianIndex(2),[0.,0.5,2.],1,quick)==ϕ(1,CartesianIndex(2),[0.,0.5,2.0]) - # inlet negative flux -> backward QUICK - @test ϕuL(1,CartesianIndex(2),[0.,0.5,2.],-1,quick)==-quick(2.0,0.5,0.0) - # outlet, positive flux -> standard QUICK - @test ϕuR(1,CartesianIndex(3),[0.,0.5,2.],1,quick)==quick(0.0,0.5,2.0) - # outlet, negative flux -> backward CD - @test ϕuR(1,CartesianIndex(3),[0.,0.5,2.],-1,quick)==-ϕ(1,CartesianIndex(3),[0.,0.5,2.0]) - - # check that ϕuSelf is the same as ϕu if explicitly provided with the same indices ϕu = WaterLily.ϕu - ϕuP = WaterLily.ϕuP - λ = WaterLily.quick - - I = CartesianIndex(3); # 1D check, positive flux - @test ϕu(1,I,[0.,0.5,2.],1,quick)==ϕuP(1,I-2δ(1,I),I,[0.,0.5,2.],1,quick); - I = CartesianIndex(2); # 1D check, negative flux - @test ϕu(1,I,[0.,0.5,2.],-1,quick)==ϕuP(1,I-2δ(1,I),I,[0.,0.5,2.],-1,quick); - # check for periodic flux - I=CartesianIndex(3);Ip=I-2δ(1,I); - f = [1.,1.25,1.5,1.75,2.]; - @test ϕuP(1,Ip,I,f,1,quick)==λ(f[Ip],f[I-δ(1,I)],f[I]) - Ip = WaterLily.CIj(1,I,length(f)-2); # make periodic - @test ϕuP(1,Ip,I,f,1,quick)==λ(f[Ip],f[I-δ(1,I)],f[I]) + # Standard QUICK at interior points + I = CartesianIndex(4); # 1D check, positive flux (well inside domain) + @test ϕu(1,I,[0.,0.5,1.0,1.5,2.,2.5],1,quick)==quick(0.5,1.0,1.5) + I = CartesianIndex(4); # 1D check, negative flux + @test ϕu(1,I,[0.,0.5,1.0,1.5,2.,2.5],-1,quick)==-quick(2.0,1.5,1.0) # check applying acceleration for f ∈ arrays @@ -275,7 +298,7 @@ end # check sdf on arrays and that it recovers set arithmetic identity for f ∈ arrays - p = zeros(Float32,4,5) |> f; measure_sdf!(p,(body1 ∩ body2) ∪ body1) + p = zeros(Float32,6,7) |> f; measure_sdf!(p,(body1 ∩ body2) ∪ body1) for I ∈ inside(p) @test GPUArrays.@allowscalar p[I]≈sdf(body1,loc(0,I,Float32)) end @@ -399,7 +422,7 @@ end L = 4 simg,sim,udf = rotating_reference(2L,SA_F64[L,L],1/L,Array) sim_step!(simg);sim_step!(sim;udf) - @test L₂(simg.flow.p)==L₂(sim.flow.p)<3e-3 # should be zero + @test L₂(simg.flow.p)==L₂(sim.flow.p)<2e-2 # should be zero (N+4 uses QUICK at boundary → higher truncation error) end @testset "Circle in accelerating flow" begin @@ -418,10 +441,10 @@ end import WaterLily: × @testset "Metrics.jl" begin - J = CartesianIndex(2,3,4); x = loc(0,J,Float64); px = prod(x) + J = CartesianIndex(3,4,5); x = loc(0,J,Float64); px = prod(x) for f ∈ arrays - u = zeros(3,4,5,3) |> f; apply!((i,x)->x[i]+prod(x),u) - p = zeros(3,4,5) |> f + u = zeros(7,8,9,3) |> f; apply!((i,x)->x[i]+prod(x),u) + p = zeros(7,8,9) |> f @inside p[I] = WaterLily.ke(I,u) @test GPUArrays.@allowscalar p[J]==0.5*sum(abs2,x .+ px) @inside p[I] = WaterLily.ke(I,u,x) @@ -437,7 +460,7 @@ import WaterLily: × @inside p[I] = WaterLily.ω_θ(I,(0,0,1),x .+ (0,1,2),u) @test GPUArrays.@allowscalar p[J]≈ω[1] apply!((x)->1,p) - @test WaterLily.L₂(p)≈prod(size(p).-2) + @test WaterLily.L₂(p)≈prod(size(p).-4) # test force routines N = 32 p = zeros(N,N) |> f; df₂ = zeros(N,N,2) |> f; df₃ = zeros(N,N,N,3) |> f @@ -494,7 +517,7 @@ import WaterLily: × @test all(meanflow.UU .== zero(T)) @test meanflow.t == T[0] - meanflow2 = MeanFlow(size(sim.flow.p).-2; uu_stats=true) + meanflow2 = MeanFlow(size(sim.flow.p).-4; uu_stats=true) @test all(meanflow2.P .== zero(T)) @test size(meanflow2.P) == size(meanflow.P) end