diff --git a/Project.toml b/Project.toml index 92fb96c6..70e91f98 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PointNeighbors" uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c" authors = ["Erik Faulhaber "] -version = "0.5" +version = "0.4.9-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/benchmarks/count_neighbors.jl b/benchmarks/count_neighbors.jl index 9f73add7..6ed4c146 100644 --- a/benchmarks/count_neighbors.jl +++ b/benchmarks/count_neighbors.jl @@ -12,18 +12,20 @@ implementations are highlighted. On the other hand, this is the least realistic For a computationally heavier benchmark, see [`benchmark_n_body`](@ref). """ -function benchmark_count_neighbors(neighborhood_search, coordinates; parallel = true) +function benchmark_count_neighbors(neighborhood_search, coordinates; + parallelization_backend = default_backend(coordinates)) n_neighbors = zeros(Int, size(coordinates, 2)) - function count_neighbors!(n_neighbors, coordinates, neighborhood_search, parallel) + function count_neighbors!(n_neighbors, coordinates, neighborhood_search, + parallelization_backend) n_neighbors .= 0 - foreach_point_neighbor(coordinates, coordinates, neighborhood_search, - parallel = parallel) do i, _, _, _ + foreach_point_neighbor(coordinates, coordinates, neighborhood_search; + parallelization_backend) do i, _, _, _ n_neighbors[i] += 1 end end return @belapsed $count_neighbors!($n_neighbors, $coordinates, - $neighborhood_search, $parallel) + $neighborhood_search, $parallelization_backend) end diff --git a/benchmarks/n_body.jl b/benchmarks/n_body.jl index 6cd02721..66650cce 100644 --- a/benchmarks/n_body.jl +++ b/benchmarks/n_body.jl @@ -12,12 +12,13 @@ This is a more realistic benchmark for particle-based simulations than However, due to the higher computational cost, differences between neighborhood search implementations are less pronounced. """ -function benchmark_n_body(neighborhood_search, coordinates_; parallel = true) +function benchmark_n_body(neighborhood_search, coordinates_; + parallelization_backend = default_backend(coordinates_)) # Passing a different backend like `CUDA.CUDABackend` # allows us to change the type of the array to run the benchmark on the GPU. # Passing `parallel = true` or `parallel = false` will not change anything here. - coordinates = PointNeighbors.Adapt.adapt(parallel, coordinates_) - nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) + coordinates = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates_) + nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search) # This preserves the data type of `coordinates`, which makes it work for GPU types mass = 1e10 * (rand!(similar(coordinates, size(coordinates, 2))) .+ 1) @@ -25,11 +26,12 @@ function benchmark_n_body(neighborhood_search, coordinates_; parallel = true) dv = similar(coordinates) - function compute_acceleration!(dv, coordinates, mass, G, neighborhood_search, parallel) + function compute_acceleration!(dv, coordinates, mass, G, neighborhood_search, + parallelization_backend) dv .= 0.0 - foreach_point_neighbor(coordinates, coordinates, neighborhood_search, - parallel = parallel) do i, j, pos_diff, distance + foreach_point_neighbor(coordinates, coordinates, neighborhood_search; + parallelization_backend) do i, j, pos_diff, distance # Only consider particles with a distance > 0 distance < sqrt(eps()) && return @@ -43,5 +45,6 @@ function benchmark_n_body(neighborhood_search, coordinates_; parallel = true) return dv end - return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs, $parallel) + return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs, + $parallelization_backend) end diff --git a/benchmarks/plot.jl b/benchmarks/plot.jl index 976cf045..bd7588c3 100644 --- a/benchmarks/plot.jl +++ b/benchmarks/plot.jl @@ -36,7 +36,7 @@ include("benchmarks/benchmarks.jl") plot_benchmarks(benchmark_count_neighbors, (10, 10), 3) """ function plot_benchmarks(benchmark, n_points_per_dimension, iterations; - parallel = true, title = "", + parallelization_backend = PolyesterBackend(), title = "", seed = 1, perturbation_factor_position = 1.0) neighborhood_searches_names = ["TrivialNeighborhoodSearch";; "GridNeighborhoodSearch";; @@ -69,7 +69,7 @@ function plot_benchmarks(benchmark, n_points_per_dimension, iterations; neighborhood_search = neighborhood_searches[i] initialize!(neighborhood_search, coordinates, coordinates) - time = benchmark(neighborhood_search, coordinates, parallel = parallel) + time = benchmark(neighborhood_search, coordinates; parallelization_backend) times[iter, i] = time time_string = BenchmarkTools.prettytime(time * 1e9) println("$(neighborhood_searches_names[i])") diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 2ccfb77d..4dfae330 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -9,7 +9,8 @@ A benchmark of the right-hand side of a full real-life Weakly Compressible Smoothed Particle Hydrodynamics (WCSPH) simulation with TrixiParticles.jl. This method is used to simulate an incompressible fluid. """ -function benchmark_wcsph(neighborhood_search, coordinates; parallel = true) +function benchmark_wcsph(neighborhood_search, coordinates; + parallelization_backend = default_backend(coordinates)) density = 1000.0 fluid = InitialCondition(; coordinates, density, mass = 0.1) @@ -34,19 +35,12 @@ function benchmark_wcsph(neighborhood_search, coordinates; parallel = true) smoothing_length, viscosity = viscosity, density_diffusion = density_diffusion) - # Note that we cannot just disable parallelism in TrixiParticles. - # But passing a different backend like `CUDA.CUDABackend` - # allows us to change the type of the array to run the benchmark on the GPU. - if parallel isa Bool - system = fluid_system - nhs = neighborhood_search - else - system = PointNeighbors.Adapt.adapt(parallel, fluid_system) - nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) - end + system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search) - v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density')) - u = PointNeighbors.Adapt.adapt(parallel, coordinates) + v = PointNeighbors.Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) dv = zero(v) # Initialize the system @@ -61,7 +55,8 @@ end Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. """ -function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true) +function benchmark_wcsph_fp32(neighborhood_search, coordinates_; + parallelization_backend = default_backend(coordinates_)) coordinates = convert(Matrix{Float32}, coordinates_) density = 1000.0f0 fluid = InitialCondition(; coordinates, density, mass = 0.1f0) @@ -88,19 +83,12 @@ function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true acceleration = (0.0f0, 0.0f0, 0.0f0), density_diffusion = density_diffusion) - # Note that we cannot just disable parallelism in TrixiParticles. - # But passing a different backend like `CUDA.CUDABackend` - # allows us to change the type of the array to run the benchmark on the GPU. - if parallel isa Bool - system = fluid_system - nhs = neighborhood_search - else - system = PointNeighbors.Adapt.adapt(parallel, fluid_system) - nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search) - end + system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search) - v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density')) - u = PointNeighbors.Adapt.adapt(parallel, coordinates) + v = PointNeighbors.Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) dv = zero(v) # Initialize the system @@ -117,7 +105,8 @@ A benchmark of the right-hand side of a full real-life Total Lagrangian Smoothed Particle Hydrodynamics (TLSPH) simulation with TrixiParticles.jl. This method is used to simulate an elastic structure. """ -function benchmark_tlsph(neighborhood_search, coordinates; parallel = true) +function benchmark_tlsph(neighborhood_search, coordinates; + parallelization_backend = default_backend(coordinates)) material = (density = 1000.0, E = 1.4e6, nu = 0.4) solid = InitialCondition(; coordinates, density = material.density, mass = 0.1) diff --git a/benchmarks/update.jl b/benchmarks/update.jl index 2fce971f..79904700 100644 --- a/benchmarks/update.jl +++ b/benchmarks/update.jl @@ -9,7 +9,8 @@ include("../test/point_cloud.jl") Benchmark neighborhood search initialization with the given `coordinates`. """ -function benchmark_initialize(neighborhood_search, coordinates; parallel = true) +function benchmark_initialize(neighborhood_search, coordinates; + parallelization_backend = default_backend(coordinates)) return @belapsed $initialize!($neighborhood_search, $coordinates, $coordinates) end @@ -21,7 +22,8 @@ perturbed point clouds. This is a good benchmark for incremental updates, since most particles stay in their cells. """ -function benchmark_update_alternating(neighborhood_search, coordinates; parallel = true) +function benchmark_update_alternating(neighborhood_search, coordinates; + parallelization_backend = default_backend(coordinates)) coordinates2 = copy(coordinates) # Perturb all coordinates with a perturbation factor of `0.015`. # This factor was tuned so that ~0.5% of the particles change their cell during an diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 979230ac..f6458409 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -11,8 +11,8 @@ using LinearAlgebra: dot using Polyester: Polyester @reexport using StaticArrays: SVector -include("util.jl") include("vector_of_vectors.jl") +include("util.jl") include("neighborhood_search.jl") include("nhs_trivial.jl") include("cell_lists/cell_lists.jl") @@ -25,9 +25,10 @@ export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoo export DictionaryCellList, FullGridCellList export ParallelUpdate, SemiParallelUpdate, SerialIncrementalUpdate, SerialUpdate, ParallelIncrementalUpdate -export requires_update, requires_resizing +export requires_update export initialize!, update!, initialize_grid!, update_grid! -export PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend +export SerialBackend, PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend, + default_backend export PeriodicBox, copy_neighborhood_search end # module PointNeighbors diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index 552b2634..be68440e 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -126,7 +126,7 @@ function Base.empty!(cell_list::FullGridCellList) (; cells) = cell_list # `Base.empty!.(cells)`, but for all backends - for i in eachindex(cells) + @threaded default_backend(cells) for i in eachindex(cells) emptyat!(cells, i) end @@ -225,6 +225,22 @@ end @inline function check_cell_bounds(cell_list::FullGridCellList, cell::Tuple) (; linear_indices) = cell_list + # Make sure that points are not added to the outer padding layer, which is needed + # to ensure that neighboring cells in all directions of all non-empty cells exist. + if !all(cell[i] in 2:(size(linear_indices, i) - 1) for i in eachindex(cell)) + size_ = [2:(size(linear_indices, i) - 1) for i in eachindex(cell)] + print_size_ = "[$(join(size_, ", "))]" + error("particle coordinates are NaN or outside the domain bounds of the cell list\n" * + "cell $cell is out of bounds for cell grid of size $print_size_") + end +end + +# On GPUs, we can't throw a proper error message because string interpolation is not allowed +@inline function check_cell_bounds(cell_list::FullGridCellList{<:DynamicVectorOfVectors{<:Any, + <:AbstractGPUArray}}, + cell::Tuple) + (; linear_indices) = cell_list + # Make sure that points are not added to the outer padding layer, which is needed # to ensure that neighboring cells in all directions of all non-empty cells exist. if !all(cell[i] in 2:(size(linear_indices, i) - 1) for i in eachindex(cell)) diff --git a/src/gpu.jl b/src/gpu.jl index 9293e19f..cd418c2e 100644 --- a/src/gpu.jl +++ b/src/gpu.jl @@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch) return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells, cell_size, update_buffer, nhs.update_strategy) end - -# This is useful to pass the backend directly to `@threaded` -KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 5a805ab9..e035720b 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -13,17 +13,8 @@ function requires_update(::AbstractNeighborhoodSearch) end """ - requires_resizing(search::AbstractNeighborhoodSearch) - -Returns `false` if the neighborhood search can be updated with a different number -of neighbor points (`y`) without re-initializing it. -""" -function requires_resizing(::AbstractNeighborhoodSearch) - error("`requires_resizing` not implemented for this neighborhood search.") -end - -""" - initialize!(search::AbstractNeighborhoodSearch, x, y) + initialize!(search::AbstractNeighborhoodSearch, x, y; + parallelization_backend = default_backend(x)) Initialize a neighborhood search with the two coordinate arrays `x` and `y`. @@ -32,19 +23,27 @@ all points in `y` whose distances to that point are smaller than the search radi `x` and `y` are expected to be matrices, where the `i`-th column contains the coordinates of point `i`. Note that `x` and `y` can be identical. +If the neighborhood search type supports parallelization, the keyword argument +`parallelization_backend` can be used to specify a parallelization backend. +See [`@threaded`](@ref) for a list of available backends. + See also [`update!`](@ref). """ -@inline initialize!(search::AbstractNeighborhoodSearch, x, y) = search +@inline function initialize!(search::AbstractNeighborhoodSearch, x, y; + parallelization_backend = default_backend(x)) + return search +end """ - update!(search::AbstractNeighborhoodSearch, x, y; points_moving = (true, true)) + update!(search::AbstractNeighborhoodSearch, x, y; points_moving = (true, true), + parallelization_backend = default_backend(x)) Update an already initialized neighborhood search with the two coordinate arrays `x` and `y`. -Like [`initialize!`](@ref), but reusing the existing data structures of the already -initialized neighborhood search. +Like [`initialize!`](@ref), but potentially reusing the existing data structures +of the already initialized neighborhood search. When the points only moved a small distance since the last `update!` or `initialize!`, -this is significantly faster than `initialize!`. +this can be significantly faster than `initialize!`. Not all implementations support incremental updates. If incremental updates are not possible for an implementation, `update!` will fall back @@ -56,20 +55,15 @@ in this case to avoid unnecessary updates. The first flag in `points_moving` indicates if points in `x` are moving. The second flag indicates if points in `y` are moving. -!!! warning "Experimental Feature: Backend Specification" - The keyword argument `parallelization_backend` allows users to specify the - multithreading backend. This feature is currently considered experimental! - - Possible parallelization backends are: - - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` - - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - - `KernelAbstractions.Backend` to launch a GPU kernel +If the neighborhood search type supports parallelization, the keyword argument +`parallelization_backend` can be used to specify a parallelization backend. +See [`@threaded`](@ref) for a list of available backends. See also [`initialize!`](@ref). """ @inline function update!(search::AbstractNeighborhoodSearch, x, y; - points_moving = (true, true), parallelization_backend = x) + points_moving = (true, true), + parallelization_backend = default_backend(x)) return search end @@ -129,7 +123,8 @@ end """ foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search; - points = axes(system_coords, 2), parallel = true) + parallelization_backend = default_backend(system_coords), + points = axes(system_coords, 2)) Loop for each point in `system_coords` over all points in `neighbor_coords` whose distances to that point are smaller than the search radius and execute the function `f(i, j, pos_diff, d)`, @@ -140,11 +135,13 @@ where (`system_coords[:, i]`) and ``y`` the coordinates of the neighbor (`neighbor_coords[:, j]`), - `d` the distance between `x` and `y`. -The `neighborhood_search` must have been initialized or updated with `system_coords` -as first coordinate array and `neighbor_coords` as second coordinate array. - Note that `system_coords` and `neighbor_coords` can be identical. +!!! warning + The `neighborhood_search` must have been initialized or updated with `system_coords` + as first coordinate array and `neighbor_coords` as second coordinate array. + This can be skipped for certain implementations. See [`requires_update`](@ref). + # Arguments - `f`: The function explained above. - `system_coords`: A matrix where the `i`-th column contains the coordinates of point `i`. @@ -155,70 +152,26 @@ Note that `system_coords` and `neighbor_coords` can be identical. # Keywords - `points`: Loop over these point indices. By default all columns of `system_coords`. -- `parallel=true`: Run the outer loop over `points` thread-parallel. +- `parallelization_backend`: Run the outer loop over `points` in parallel with the + specified backend. By default, the backend is selected + automatically based on the type of `system_coords`. + See [`@threaded`](@ref) for a list of available backends. See also [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; - parallel::Union{Bool, ParallelizationBackend} = true, + parallelization_backend::ParallelizationBackend = default_backend(system_coords), points = axes(system_coords, 2)) where {T} # The type annotation above is to make Julia specialize on the type of the function. # Otherwise, unspecialized code will cause a lot of allocations # and heavily impact performance. # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing - if parallel isa Bool - # When `false` is passed, run serially. When `true` is passed, run either a - # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array, - # launch the loop as a kernel on the GPU. - parallel_ = Val(parallel) - elseif parallel isa ParallelizationBackend - # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel - # on this backend. This is useful to test the GPU code on the CPU by passing - # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`. - parallel_ = parallel - end - - foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points, - parallel_) -end -@inline function foreach_point_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, points, parallel::Val{true}) # Explicit bounds check before the hot loop (or GPU kernel) @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) - @threaded system_coords for point in points - # Now we can assume that `point` is inbounds - @inbounds foreach_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, point) - end - - return nothing -end - -# When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend -@inline function foreach_point_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, points, - backend::ParallelizationBackend) - # Explicit bounds check before the hot loop (or GPU kernel) - @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) - - @threaded backend for point in points - # Now we can assume that `point` is inbounds - @inbounds foreach_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, point) - end - - return nothing -end - -@inline function foreach_point_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, points, parallel::Val{false}) - # Explicit bounds check before the hot loop - @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) - - for point in points - # Now we can assume that `point` is inbounds + @threaded parallelization_backend for point in points + # Now we can safely assume that `point` is inbounds @inbounds foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search, point) end diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index f46316d5..e29ac8c9 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -166,18 +166,6 @@ See [`GridNeighborhoodSearch`](@ref) for usage information. """ struct SerialUpdate end -@inline function requires_resizing(::GridNeighborhoodSearch{<:Any, ParallelUpdate}) - # Update is just a re-initialization, so no re-initialization is needed - # when the number of points changes. - return false -end - -@inline function requires_resizing(::GridNeighborhoodSearch) - # Incremental update strategies require re-initialization - # when the number of points changes. - return true -end - # No update buffer needed for non-incremental update/initialize @inline function create_update_buffer(::Union{SerialUpdate, ParallelUpdate}, _, _) return nothing @@ -205,12 +193,13 @@ end end function initialize!(neighborhood_search::GridNeighborhoodSearch, - x::AbstractMatrix, y::AbstractMatrix) - initialize_grid!(neighborhood_search, y) + x::AbstractMatrix, y::AbstractMatrix; + parallelization_backend = default_backend(x)) + initialize_grid!(neighborhood_search, y; parallelization_backend) end function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::AbstractMatrix; - parallelization_backend = y) + parallelization_backend = default_backend(y)) (; cell_list) = neighborhood_search empty!(cell_list) @@ -221,9 +210,10 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra return neighborhood_search end + # Ignore the parallelization backend here. This cannot be parallelized. for point in axes(y, 2) # Get cell index of the point's cell - point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + point_coords = @inbounds extract_svector(y, Val(ndims(neighborhood_search)), point) cell = cell_coords(point_coords, neighborhood_search) # Add point to corresponding cell @@ -233,14 +223,9 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra return neighborhood_search end -# WARNING! Experimental feature: -# By default, determine the parallelization backend from the type of `y`. -# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code -# on this backend. This can be useful to run GPU kernels on the CPU by passing -# `parallelization_backend = KernelAbstractions.CPU()`, even though `y isa Array`. function initialize_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate}, - y::AbstractMatrix; parallelization_backend = y) + y::AbstractMatrix; parallelization_backend = default_backend(y)) (; cell_list) = neighborhood_search empty!(cell_list) @@ -263,14 +248,9 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, return neighborhood_search end -# WARNING! Experimental feature: -# By default, determine the parallelization backend from the type of `x`. -# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code -# on this backend. This can be useful to run GPU kernels on the CPU by passing -# `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`. function update!(neighborhood_search::GridNeighborhoodSearch, x::AbstractMatrix, y::AbstractMatrix; - points_moving = (true, true), parallelization_backend = x) + points_moving = (true, true), parallelization_backend = default_backend(x)) # The coordinates of the first set of points are irrelevant for this NHS. # Only update when the second set is moving. points_moving[2] || return neighborhood_search @@ -279,19 +259,8 @@ function update!(neighborhood_search::GridNeighborhoodSearch, end # Update only with neighbor coordinates -function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS}, - y::AbstractMatrix; parallelization_backend = y) where {NDIMS} - update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i); - parallelization_backend) -end - -# Serial and semi-parallel update. -# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`. -function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, - SerialIncrementalUpdate}, - GridNeighborhoodSearch{<:Any, - SemiParallelUpdate}}, - coords_fun::Function; parallelization_backend = nothing) +function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS}, y::AbstractMatrix; + parallelization_backend = default_backend(y)) where {NDIMS} (; cell_list, update_buffer) = neighborhood_search # Empty each thread's list @@ -301,7 +270,7 @@ function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, # Find all cells containing points that now belong to another cell. # This loop is threaded for `update_strategy == SemiParallelUpdate()`. - mark_changed_cells!(neighborhood_search, coords_fun, parallelization_backend) + mark_changed_cells!(neighborhood_search, y, parallelization_backend) # Iterate over all marked cells and move the points into their new cells. # This is always a serial loop (hence "semi-parallel"). @@ -316,11 +285,10 @@ function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, # `deleteat_cell!(..., i)` will change the order of points that come after `i`. for i in reverse(eachindex(points)) point = points[i] - cell_coords_ = cell_coords(coords_fun(point), neighborhood_search) - - if !is_correct_cell(cell_list, cell_coords_, cell_index) - new_cell_coords = cell_coords(coords_fun(point), neighborhood_search) + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + new_cell_coords = cell_coords(point_coords, neighborhood_search) + if !is_correct_cell(cell_list, new_cell_coords, cell_index) # Add point to new cell or create cell if it does not exist push_cell!(cell_list, new_cell_coords, point) @@ -339,31 +307,33 @@ end # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any, SemiParallelUpdate}, - coords_fun::T, parallelization_backend) where {T} + y, parallelization_backend) where {T} (; cell_list) = neighborhood_search # `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed # first to be able to be used in a threaded loop. This function takes care of that. @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list) - mark_changed_cell!(neighborhood_search, cell_index, coords_fun) + mark_changed_cell!(neighborhood_search, cell_index, y) end end @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any, SerialIncrementalUpdate}, - coords_fun::T, _) where {T} + y, _) where {T} (; cell_list) = neighborhood_search + # Ignore the parallelization backend here for `SerialIncrementalUpdate`. for cell_index in each_cell_index(cell_list) - mark_changed_cell!(neighborhood_search, cell_index, coords_fun) + mark_changed_cell!(neighborhood_search, cell_index, y) end end -@inline function mark_changed_cell!(neighborhood_search, cell_index, coords_fun) +@inline function mark_changed_cell!(neighborhood_search, cell_index, y) (; cell_list, update_buffer) = neighborhood_search for point in cell_list[cell_index] - cell = cell_coords(coords_fun(point), neighborhood_search) + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) # `cell` is a tuple, `cell_index` is the linear index used internally by the # cell list to store cells inside `cell`. @@ -376,11 +346,10 @@ end end end -# Fully parallel update with atomic push. -# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`. +# Fully parallel incremental update with atomic push. function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelIncrementalUpdate}, - coords_fun::Function; parallelization_backend = nothing) + y::AbstractMatrix; parallelization_backend = default_backend(y)) (; cell_list, update_buffer) = neighborhood_search # Note that we need two separate loops for adding and removing points. @@ -399,11 +368,10 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list) for i in 1:update_buffer[cell_index] point = cell_list.cells.backend[i, cell_index] - cell_coords_ = cell_coords(coords_fun(point), neighborhood_search) - - if !is_correct_cell(cell_list, cell_coords_, cell_index) - new_cell_coords = cell_coords(coords_fun(point), neighborhood_search) + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + new_cell_coords = cell_coords(point_coords, neighborhood_search) + if !is_correct_cell(cell_list, new_cell_coords, cell_index) # Add point to new cell or create cell if it does not exist push_cell_atomic!(cell_list, new_cell_coords, point) end @@ -419,9 +387,10 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, # `deleteat_cell!(..., i)` will change the order of points that come after `i`. for i in reverse(eachindex(points)) point = points[i] - cell_coords_ = cell_coords(coords_fun(point), neighborhood_search) + point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point) + new_cell_coords = cell_coords(point_coords, neighborhood_search) - if !is_correct_cell(cell_list, cell_coords_, cell_index) + if !is_correct_cell(cell_list, new_cell_coords, cell_index) # Remove moved point from this cell deleteat_cell!(cell_list, cell_index, i) end @@ -431,13 +400,12 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, return neighborhood_search end -# Note that this is only defined when a matrix `y` is passed. When updating with a function, -# it will fall back to the semi-parallel update. +# Non-incremental update strategies just forward to `initialize_grid!` function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any, ParallelUpdate}, GridNeighborhoodSearch{<:Any, SerialUpdate}}, - y::AbstractMatrix; parallelization_backend = y) + y::AbstractMatrix; parallelization_backend = default_backend(y)) initialize_grid!(neighborhood_search, y; parallelization_backend) end diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index cb58cfb3..e46926d0 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -47,22 +47,20 @@ end @inline requires_update(::PrecomputedNeighborhoodSearch) = (true, true) -@inline function requires_resizing(search::PrecomputedNeighborhoodSearch) - return requires_resizing(search.neighborhood_search) -end - @inline function search_radius(search::PrecomputedNeighborhoodSearch) return search_radius(search.neighborhood_search) end function initialize!(search::PrecomputedNeighborhoodSearch, - x::AbstractMatrix, y::AbstractMatrix) + x::AbstractMatrix, y::AbstractMatrix; + parallelization_backend = default_backend(x)) (; neighborhood_search, neighbor_lists) = search # Initialize grid NHS - initialize!(neighborhood_search, x, y) + initialize!(neighborhood_search, x, y; parallelization_backend) - initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y) + initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y, + parallelization_backend) end # WARNING! Experimental feature: @@ -72,7 +70,7 @@ end # `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`. function update!(search::PrecomputedNeighborhoodSearch, x::AbstractMatrix, y::AbstractMatrix; - points_moving = (true, true), parallelization_backend = x) + points_moving = (true, true), parallelization_backend = default_backend(x)) (; neighborhood_search, neighbor_lists) = search # Update grid NHS @@ -80,11 +78,13 @@ function update!(search::PrecomputedNeighborhoodSearch, # Skip update if both point sets are static if any(points_moving) - initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y) + initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y, + parallelization_backend) end end -function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y) +function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y, + parallelization_backend) # Initialize neighbor lists empty!(neighbor_lists) resize!(neighbor_lists, size(x, 2)) @@ -93,7 +93,8 @@ function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y) end # Fill neighbor lists - foreach_point_neighbor(x, y, neighborhood_search) do point, neighbor, _, _ + foreach_point_neighbor(x, y, neighborhood_search; + parallelization_backend) do point, neighbor, _, _ push!(neighbor_lists[point], neighbor) end end diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl index fbc57631..9e7c564b 100644 --- a/src/nhs_trivial.jl +++ b/src/nhs_trivial.jl @@ -32,12 +32,14 @@ end @inline requires_update(::TrivialNeighborhoodSearch) = (false, false) -@inline requires_resizing(::TrivialNeighborhoodSearch) = false - -@inline initialize!(search::TrivialNeighborhoodSearch, x, y) = search +@inline function initialize!(search::TrivialNeighborhoodSearch, x, y; + parallelization_backend = default_backend(x)) + return search +end @inline function update!(search::TrivialNeighborhoodSearch, x, y; - points_moving = (true, true), parallelization_backend = x) + points_moving = (true, true), + parallelization_backend = default_backend(x)) return search end diff --git a/src/util.jl b/src/util.jl index 7cfbfedc..a188c80b 100644 --- a/src/util.jl +++ b/src/util.jl @@ -35,6 +35,13 @@ end abstract type AbstractThreadingBackend end +""" + SerialBackend() + +Pass as first argument to the [`@threaded`](@ref) macro to run the loop serially. +""" +struct SerialBackend <: AbstractThreadingBackend end + """ PolyesterBackend() @@ -63,32 +70,41 @@ struct ThreadsStaticBackend <: AbstractThreadingBackend end const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend} """ - @threaded x for ... end + default_backend(x) -Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `x`. +Select the recommended backend for an array `x`. +This allows to write generic code that works for both CPU and GPU arrays. + +The default backend for CPU arrays is currently `PolyesterBackend()`. +For GPU arrays, the respective `KernelAbstractions.Backend` is returned. +""" +@inline default_backend(::AbstractArray) = PolyesterBackend() +@inline default_backend(x::AbstractGPUArray) = KernelAbstractions.get_backend(x) +@inline default_backend(x::DynamicVectorOfVectors) = default_backend(x.backend) + +""" + @threaded backend for ... end + +Run either a threaded CPU loop or launch a kernel on the GPU, depending on the `backend`. Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange` but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -The first argument must either be a parallelization backend (see below) or an array from -which the backend can be derived to determine if the loop must be run threaded on the CPU -or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU -kernel on the CPU. - Possible parallelization backends are: +- [`SerialBackend`](@ref) to disable multithreading - [`PolyesterBackend`](@ref) to use `Polyester.@batch` - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic` - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static` -- `KernelAbstractions.Backend` to execute the loop as a GPU kernel +- Any `KernelAbstractions.Backend` to execute the loop as a GPU kernel -In particular, the underlying threading capabilities might be provided by other packages -such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). +Use `default_backend(x)` to select the recommended backend for an array `x`. +This allows to write generic code that works for both CPU and GPU arrays. !!! warning "Warning" This macro does not necessarily work for general `for` loops. For example, it does not necessarily support general iterables such as `eachline(filename)`. """ -macro threaded(system, expr) +macro threaded(backend, expr) # Reverse-engineer the for loop. # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`. # So, `expr.args[1].args[2]` is the iterator `eachindex(x)` @@ -100,37 +116,42 @@ macro threaded(system, expr) # Assemble the `for` loop again as a call to `parallel_foreach`, using `$i` to use the # same loop variable as used in the for loop. return esc(quote - PointNeighbors.parallel_foreach($iterator, $system) do $i + PointNeighbors.parallel_foreach($iterator, $backend) do $i $inner_loop end end) end -# Use `Polyester.@batch` for low-overhead threading -# This is currently the default when x::Array -@inline function parallel_foreach(f, iterator, x) +# Serial loop +@inline function parallel_foreach(f, iterator, ::SerialBackend) + for i in iterator + @inline f(i) + end +end + +# Use `Polyester.@batch` +@inline function parallel_foreach(f, iterator, ::PolyesterBackend) Polyester.@batch for i in iterator @inline f(i) end end # Use `Threads.@threads :dynamic` -@inline function parallel_foreach(f, iterator, x::ThreadsDynamicBackend) +@inline function parallel_foreach(f, iterator, ::ThreadsDynamicBackend) Threads.@threads :dynamic for i in iterator @inline f(i) end end # Use `Threads.@threads :static` -@inline function parallel_foreach(f, iterator, x::ThreadsStaticBackend) +@inline function parallel_foreach(f, iterator, ::ThreadsStaticBackend) Threads.@threads :static for i in iterator @inline f(i) end end # On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl -@inline function parallel_foreach(f, iterator, - x::Union{AbstractGPUArray, KernelAbstractions.Backend}) +@inline function parallel_foreach(f, iterator, backend::KernelAbstractions.Backend) # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` # and index with `iterator[eachindex(iterator)[i]]`. # Note that this only works with vector-like iterators that support arbitrary indexing. @@ -140,8 +161,6 @@ end # Skip empty loops ndrange == 0 && return - backend = KernelAbstractions.get_backend(x) - # Call the generic kernel that is defined below, which only calls a function with # the global GPU index. generic_kernel(backend)(ndrange = ndrange) do i diff --git a/test/cell_lists/full_grid.jl b/test/cell_lists/full_grid.jl index d4a81b55..9cc360e9 100644 --- a/test/cell_lists/full_grid.jl +++ b/test/cell_lists/full_grid.jl @@ -15,19 +15,18 @@ nhs = GridNeighborhoodSearch{N}(; cell_list, search_radius) y = rand(N, 10) error_string = "particle coordinates are NaN or outside the domain bounds of the cell list" - error = ErrorException(error_string) y[1, 7] = NaN - @test_throws error initialize!(nhs, y, y) + @test_throws error_string initialize!(nhs, y, y) y[1, 7] = min_corner[1] - 0.01 - @test_throws error initialize!(nhs, y, y) + @test_throws error_string initialize!(nhs, y, y) # A bit more than max corner might still be inside the grid, # but one search radius more is always outside. # Also accounting for 0.001 extra padding (see above). y[1, 7] = max_corner[1] + 1.01 - @test_throws error initialize!(nhs, y, y) + @test_throws error_string initialize!(nhs, y, y) y[1, 7] = 0.0 @test_nowarn_mod initialize!(nhs, y, y) @@ -37,16 +36,16 @@ @test_nowarn_mod update!(nhs, y, y) y[1, 7] = NaN - @test_throws error update!(nhs, y, y) + @test_throws error_string update!(nhs, y, y) # A bit more than max corner might still be inside the grid, # but one search radius more is always outside. # Also accounting for 0.001 extra padding (see above). y[1, 7] = max_corner[1] + 1.01 - @test_throws error update!(nhs, y, y) + @test_throws error_string update!(nhs, y, y) y[1, 7] = min_corner[1] - 0.01 - @test_throws error update!(nhs, y, y) + @test_throws error_string update!(nhs, y, y) end end end diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl index e01efa9b..33bd72e4 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -145,8 +145,10 @@ neighbors_expected = [Int[] for _ in axes(coords, 2)] foreach_point_neighbor(coords, coords, trivial_nhs, - parallel = false) do point, neighbor, - pos_diff, distance + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance append!(neighbors_expected[point], neighbor) end @@ -236,8 +238,10 @@ neighbors = [Int[] for _ in axes(coords, 2)] foreach_point_neighbor(coords, coords, nhs, - parallel = false) do point, neighbor, - pos_diff, distance + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance append!(neighbors[point], neighbor) end diff --git a/test/nhs_grid.jl b/test/nhs_grid.jl index e7e3807a..5bd2aaf3 100644 --- a/test/nhs_grid.jl +++ b/test/nhs_grid.jl @@ -167,8 +167,7 @@ coordinates2 = coordinates1 .+ [1.4, -3.5, 0.8] # Update neighborhood search - coords_fun2(i) = coordinates2[:, i] - update_grid!(nhs1, coords_fun2) + update_grid!(nhs1, coordinates2) # Get each neighbor for updated NHS neighbors2 = sort(collect(PointNeighbors.eachneighbor(point_position1, nhs1)))