diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl index ffabc805..ff08ebe3 100644 --- a/benchmarks/run_benchmarks.jl +++ b/benchmarks/run_benchmarks.jl @@ -50,10 +50,11 @@ run_benchmark(benchmark_count_neighbors, (10, 10), 3, ``` """ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches; + search_radius_factor = 3.0, parallelization_backend = PolyesterBackend(), names = ["Neighborhood search $i" for i in 1:length(neighborhood_searches)]', - seed = 1, perturbation_factor_position = 1.0) + seed = 1, perturbation_factor_position = 1.0, shuffle = false) # Multiply number of points in each iteration (roughly) by this factor scaling_factor = 4 per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension)) @@ -64,24 +65,29 @@ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborho times = zeros(iterations, length(neighborhood_searches)) for iter in 1:iterations - coordinates = point_cloud(sizes[iter]; seed, perturbation_factor_position) + coordinates_ = point_cloud(sizes[iter], search_radius_factor; + seed, perturbation_factor_position, shuffle) + coordinates = convert.(typeof(search_radius_factor), coordinates_) domain_size = maximum(sizes[iter]) + 1 # Normalize domain size to 1 coordinates ./= domain_size # Make this Float32 to make sure that Float32 benchmarks use Float32 exclusively - search_radius = 4.0f0 / domain_size + search_radius = search_radius_factor / domain_size n_particles = size(coordinates, 2) neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches, search_radius, n_particles) for i in eachindex(neighborhood_searches_copy) - neighborhood_search = neighborhood_searches_copy[i] - PointNeighbors.initialize!(neighborhood_search, coordinates, coordinates) + neighborhood_search_ = neighborhood_searches_copy[i] + neighborhood_search = PointNeighbors.Adapt.adapt(parallelization_backend, + neighborhood_search_) + coords = 
PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) + PointNeighbors.initialize!(neighborhood_search, coords, coords) - time = benchmark(neighborhood_search, coordinates; parallelization_backend) + time = benchmark(neighborhood_search, coords; parallelization_backend) times[iter, i] = time time_string = BenchmarkTools.prettytime(time * 1e9) time_string_per_particle = BenchmarkTools.prettytime(time * 1e9 / n_particles) @@ -170,23 +176,74 @@ include("benchmarks/benchmarks.jl") run_benchmark_gpu(benchmark_n_body, (10, 10), 3) ``` """ -function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...) +function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) NDIMS = length(n_points_per_dimension) min_corner = 0.0f0 .* n_points_per_dimension max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) - neighborhood_searches = [GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0, - cell_list = FullGridCellList(; - search_radius = 0.0f0, - min_corner, - max_corner)) - PrecomputedNeighborhoodSearch{NDIMS}(search_radius = 0.0f0)] - - names = ["GridNeighborhoodSearch with FullGridCellList";; - "PrecomputedNeighborhoodSearch"] + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + transpose_backend = parallelization_backend isa PointNeighbors.KernelAbstractions.GPU + neighborhood_searches = [ + grid_nhs + PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, + update_neighborhood_search = grid_nhs, + transpose_backend)#, max_neighbors=128) + ] + + names = [ + "GridNeighborhoodSearch with FullGridCellList";; + "PrecomputedNeighborhoodSearch" + ] run_benchmark(benchmark, n_points_per_dimension, iterations, - neighborhood_searches; names, kwargs...) 
+ neighborhood_searches; names, parallelization_backend, kwargs...) +end + +""" + run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; kwargs...) + +Shortcut to call [`run_benchmark`](@ref) with a `GridNeighborhoodSearch` with a +`FullGridCellList`. This is the neighborhood search implementation that is used +in TrixiParticles.jl when performance is important. +Use this function to benchmark and profile TrixiParticles.jl kernels. + +# Arguments +- `benchmark`: The benchmark function. See [`benchmark_count_neighbors`](@ref), + [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref), + [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref). +- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number + of points. For example, use `(100, 100)` for a 2D benchmark or + `(10, 10, 10)` for a 3D benchmark. +- `iterations`: Number of refinement iterations + +# Keywords +See [`run_benchmark`](@ref) for a list of available keywords. + +# Examples +```julia +include("benchmarks/benchmarks.jl") + +run_benchmark_full_grid(benchmark_n_body, (10, 10), 3) +``` +""" +function run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) + NDIMS = length(n_points_per_dimension) + + min_corner = 0.0f0 .* n_points_per_dimension + max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + neighborhood_searches = [grid_nhs] + + names = ["GridNeighborhoodSearch with FullGridCellList";;] + + run_benchmark(benchmark, n_points_per_dimension, iterations, + neighborhood_searches; names, parallelization_backend, kwargs...) 
end """ diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 7286baca..88e38508 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -1,4 +1,5 @@ using PointNeighbors +using PointNeighbors.Adapt using TrixiParticles using BenchmarkTools @@ -43,47 +44,30 @@ This method is used to simulate an incompressible fluid. """ function benchmark_wcsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - density = 1000.0 - particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1, particle_spacing) - - sound_speed = 10.0 - state_equation = StateEquationCole(; sound_speed, reference_density = density, - exponent = 1) - - viscosity = ArtificialViscosityMonaghan(alpha = 0.02, beta = 0.0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -""" - benchmark_wcsph_fp32(neighborhood_search, coordinates; - parallelization_backend = default_backend(coordinates)) - -Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. 
-""" -function benchmark_wcsph_fp32(neighborhood_search, coordinates_; - parallelization_backend = default_backend(coordinates_)) - coordinates = convert(Matrix{Float32}, coordinates_) - density = 1000.0f0 + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + density = convert(ELTYPE, 1000.0) particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1f0, particle_spacing) + fluid = InitialCondition(; coordinates = coordinates_cpu, density, + mass = convert(ELTYPE, 0.1) * particle_spacing, + particle_spacing) - sound_speed = 10.0f0 + # Make sure that the computed forces are not all zero + for i in eachindex(fluid.density) + fluid.density[i] += rand(eltype(fluid.density)) + end + + sound_speed = convert(ELTYPE, 10.0) state_equation = StateEquationCole(; sound_speed, reference_density = density, exponent = 1) - viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0) + viscosity = ArtificialViscosityMonaghan(alpha = convert(ELTYPE, 0.02), + beta = convert(ELTYPE, 0.0)) + density_diffusion = DensityDiffusionMolteniColagrossi(delta = convert(ELTYPE, 0.1)) - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_equation, - viscosity, density_diffusion, parallelization_backend) # Compact support == 2 * smoothing length for these kernels smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2 if ndims(neighborhood_search) == 1 @@ -92,23 +76,21 @@ function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_e smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end - fluid_system = WeaklyCompressibleSPHSystem(initial_condition, ContinuityDensity(), + fluid_system = 
WeaklyCompressibleSPHSystem(fluid, ContinuityDensity(), state_equation, smoothing_kernel, smoothing_length, viscosity = viscosity, density_diffusion = density_diffusion) - system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + system = Adapt.adapt(parallelization_backend, fluid_system) # Remove unnecessary data structures that are only used for initialization - neighborhood_search_ = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) - nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search_) semi = DummySemidiscretization(nhs, parallelization_backend, true) - v = PointNeighbors.Adapt.adapt(parallelization_backend, - vcat(initial_condition.velocity, - initial_condition.density')) - u = PointNeighbors.Adapt.adapt(parallelization_backend, initial_condition.coordinates) + v = Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = Adapt.adapt(parallelization_backend, fluid.coordinates) dv = zero(v) # Initialize the system @@ -128,8 +110,15 @@ This method is used to simulate an elastic structure. 
""" function benchmark_tlsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - material = (density = 1000.0, E = 1.4e6, nu = 0.4) - solid = InitialCondition(; coordinates, density = material.density, mass = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) + + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + material = (density = convert(ELTYPE, 1000.0), E = convert(ELTYPE, 1.4e6), + nu = convert(ELTYPE, 0.4)) + solid = InitialCondition(; coordinates = coordinates_cpu, + density = material.density, mass = convert(ELTYPE, 0.1)) # Compact support == 2 * smoothing length for these kernels smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2 @@ -142,15 +131,20 @@ function benchmark_tlsph(neighborhood_search, coordinates; solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_length, material.E, material.nu) - semi = DummySemidiscretization(neighborhood_search, parallelization_backend, true) + system_ = Adapt.adapt(parallelization_backend, solid_system) - v = copy(solid.velocity) - u = copy(solid.coordinates) + # Remove unnecessary data structures that are only used for initialization + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + system = TrixiParticles.@set system_.self_interaction_nhs = nhs + + semi = DummySemidiscretization(nhs, parallelization_backend, true) + + v = Adapt.adapt(parallelization_backend, copy(solid.velocity)) + u = Adapt.adapt(parallelization_backend, copy(solid.coordinates)) dv = zero(v) # Initialize the system - TrixiParticles.initialize!(solid_system, semi) + TrixiParticles.initialize!(system, semi) - return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, - $solid_system, $solid_system, $semi) + return @belapsed TrixiParticles.interact_structure_structure2!($dv, $v, $system, $semi) end diff --git 
a/test/neighborhood_search.jl b/test/neighborhood_search.jl index 65d3ed85..dd1bedea 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -142,14 +142,14 @@ "($(seed == 1 ? "`initialize!`" : "`update!`"))" @testset verbose=true "$(name(cloud_size, seed)))" for cloud_size in cloud_sizes, seed in seeds - coords = point_cloud(cloud_size, seed = seed) + search_radius = 2.5 + coords = point_cloud(cloud_size, search_radius, seed = seed) NDIMS = length(cloud_size) n_points = size(coords, 2) - search_radius = 2.5 # Use different coordinates for `initialize!` and then `update!` with the # correct coordinates to make sure that `update!` is working as well. - coords_initialize = point_cloud(cloud_size, seed = 1) + coords_initialize = point_cloud(cloud_size, search_radius, seed = 1) # Compute expected neighbor lists by brute-force looping over all points # as potential neighbors (`TrivialNeighborhoodSearch`). diff --git a/test/point_cloud.jl b/test/point_cloud.jl index ba99b2e1..64c41bec 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -1,8 +1,9 @@ using Random # Generate a rectangular point cloud, optionally with a perturbation in the point positions -function point_cloud(n_points_per_dimension; - seed = 1, perturbation_factor_position = 1.0) +function point_cloud(n_points_per_dimension, search_radius; + seed = 1, perturbation_factor_position = 1.0, + sort = true, shuffle = false) # Fixed seed to ensure reproducibility Random.seed!(seed) @@ -10,8 +11,15 @@ function point_cloud(n_points_per_dimension; coordinates = Array{Float64}(undef, n_dims, prod(n_points_per_dimension)) cartesian_indices = CartesianIndices(n_points_per_dimension) + # Extra data structures for the sorting code below + cell_coords = Vector{SVector{n_dims, Int}}(undef, size(coordinates, 2)) + cell_size = ntuple(dim -> search_radius, n_dims) + for i in axes(coordinates, 2) - coordinates[:, i] .= Tuple(cartesian_indices[i]) + point_coords = 
SVector(Tuple(cartesian_indices[i])) + coordinates[:, i] .= point_coords + cell_coords[i] = PointNeighbors.cell_coords(point_coords, nothing, nothing, + cell_size) .+ 1 end # A standard deviation of 0.05 in the particle coordinates @@ -21,6 +29,23 @@ function point_cloud(n_points_per_dimension; # The benchmark results are also consistent with the timer output of the simulation. perturb!(coordinates, perturbation_factor_position * 0.05) + # Sort by linear cell index + if sort + if shuffle + throw(ArgumentError("cannot sort and shuffle at the same time")) + end + + # Sort by Z index (with `using Morton`) + # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) + + permutation = sortperm(cell_coords) + coordinates .= coordinates[:, permutation] + elseif shuffle + # Shuffle the point order randomly. The call must be qualified as + # `Random.shuffle` because the Bool keyword argument `shuffle` + # shadows the function of the same name in this scope. + permutation = Random.shuffle(axes(coordinates, 2)) + coordinates .= coordinates[:, permutation] + end + return coordinates end