PointNeighbors.jl/benchmarks/run_benchmarks.jl at ed26ee0d63d60c6a1f68f5d2c10155a1531b1d7f · trixi-framework/PointNeighbors.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
using Plots
using BenchmarkTools

# Generate a rectangular point cloud
include("../test/point_cloud.jl")

"""
    run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches;
                  search_radius_factor = 3.0,
                  parallelization_backend = PolyesterBackend(),
                  names = ["Neighborhood search 1" "Neighborhood search 2" ...],
                  seed = 1, perturbation_factor_position = 1.0, shuffle = false)

Run a benchmark with several neighborhood searches multiple times for increasing numbers
of points and return the results as `(n_particles_vec, times)`, where `n_particles_vec`
is a vector containing the number of particles for each iteration and `times` is a matrix
containing the runtimes for each neighborhood search and iteration.

See also
- [`plot_benchmark`](@ref) to plot the results,
- [`run_benchmark_default`](@ref) to run the benchmark with the most commonly used
  neighborhood search implementations,
- [`run_benchmark_gpu`](@ref) to run the benchmark with all GPU-compatible neighborhood
  search implementations.

# Arguments
- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
                            of points. For example, use `(100, 100)` for a 2D benchmark or
                            `(10, 10, 10)` for a 3D benchmark.
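- `iterations`:             Number of refinement iterations
- `neighborhood_searches`:  Collection of neighborhood searches to benchmark. Each one is
                            passed to `copy_neighborhood_search` together with the search
                            radius and number of points of the current iteration before it
                            is initialized and benchmarked.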

# Keywords
- `parallelization_backend = PolyesterBackend()`: Parallelization strategy to use. See
                                                  [`@threaded`](@ref) for a list of available
                                                  backends.
- `seed = 1`:               Seed to perturb the point positions. Different seeds yield
                            slightly different point positions.
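- `perturbation_factor_position = 1.0`: Scale the point position perturbation by this factor.
                                        A factor of `1.0` corresponds to a standard deviation
                                        similar to that of a realistic simulation.
- `search_radius_factor = 3.0`: Passed to `point_cloud`. The search radius is this factor
                            divided by `maximum(resolution) + 1`; the coordinates are
                            normalized by the same value. Coordinates and search radius use
                            the type of this factor, so pass `3.0f0` for a `Float32` benchmark.
- `names`:                  Names of the neighborhood searches, printed with each result.
                            Defaults to `"Neighborhood search i"` for the `i`-th entry.
- `shuffle = false`:        Passed to `point_cloud` together with `seed` and
                            `perturbation_factor_position`.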

# Examples
```julia
include("benchmarks/benchmarks.jl")

run_benchmark(benchmark_count_neighbors, (10, 10), 3,
              [TrivialNeighborhoodSearch{2}(), GridNeighborhoodSearch{2}()])
```
"""
function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches;
                       search_radius_factor = 3.0,
                       parallelization_backend = PolyesterBackend(),
                       names = permutedims(["Neighborhood search $i"
                                            for i in eachindex(neighborhood_searches)]),
                       seed = 1, perturbation_factor_position = 1.0, shuffle = false)
    # NOTE: The default `names` is built with `permutedims` instead of `'`. The adjoint
    # of a `Vector{String}` is lazy and recursive, so indexing it calls
    # `adjoint(::String)`, which has no method and throws a `MethodError`.

    # `names[i]` is read for every neighborhood search below. Fail early instead of
    # erroring after some (potentially expensive) benchmarks have already been run.
    if length(names) < length(neighborhood_searches)
        throw(ArgumentError("`names` must contain one name per neighborhood search, " *
                            "got $(length(names)) names for " *
                            "$(length(neighborhood_searches)) neighborhood searches"))
    end

    # Multiply number of points in each iteration (roughly) by this factor
    scaling_factor = 4
    per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension))
    sizes = [round.(Int, n_points_per_dimension .* per_dimension_factor^(iter - 1))
             for iter in 1:iterations]

    n_particles_vec = prod.(sizes)

    # One row per iteration, one column per neighborhood search
    times = zeros(iterations, length(neighborhood_searches))

    for iter in 1:iterations
        resolution = sizes[iter]
        coordinates_ = point_cloud(resolution, search_radius_factor;
                                   seed, perturbation_factor_position, shuffle)

        # Use the floating-point type of `search_radius_factor` for the coordinates
        coordinates = convert.(typeof(search_radius_factor), coordinates_)
        domain_size = maximum(resolution) + 1

        # Normalize domain size to 1
        coordinates ./= domain_size

        # Dividing by the integer `domain_size` keeps the type of `search_radius_factor`,
        # so a `Float32` factor yields a `Float32` search radius and Float32 benchmarks
        # use Float32 exclusively.
        search_radius = search_radius_factor / domain_size

        # Each column of `coordinates` is counted as one point
        n_particles = size(coordinates, 2)

        neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches,
                                                               search_radius, n_particles)

        for i in eachindex(neighborhood_searches_copy)
            # Move the neighborhood search and the coordinates to the requested backend
            neighborhood_search = PointNeighbors.Adapt.adapt(parallelization_backend,
                                                             neighborhood_searches_copy[i])
            coords = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates)
            PointNeighbors.initialize!(neighborhood_search, coords, coords)

            # `benchmark` returns a time that is stored as is and scaled by `1e9` only
            # for printing with `prettytime`.
            runtime = benchmark(neighborhood_search, coords; parallelization_backend)
            times[iter, i] = runtime

            time_string = BenchmarkTools.prettytime(runtime * 1e9)
            time_string_per_particle = BenchmarkTools.prettytime(runtime * 1e9 /
                                                                 n_particles)
            println("$(names[i])")
            println("with $(join(resolution, "x")) = $(prod(resolution)) particles " *
                    "finished in $time_string ($time_string_per_particle per particle)\n")
        end
    end

    return n_particles_vec, times
end

"""
    run_benchmark_default(benchmark, n_points_per_dimension, iterations; kwargs...)

Shortcut to call [`run_benchmark`](@ref) with the most commonly used neighborhood search
implementations:
- `GridNeighborhoodSearch`
- `GridNeighborhoodSearch` with `FullGridCellList`
- `PrecomputedNeighborhoodSearch`

# Arguments
- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
                            of points. For example, use `(100, 100)` for a 2D benchmark or
                            `(10, 10, 10)` for a 3D benchmark.
- `iterations`:             Number of refinement iterations

# Keywords
See [`run_benchmark`](@ref) for a list of available keywords.

# Examples
```julia
include("benchmarks/benchmarks.jl")

run_benchmark_default(benchmark_n_body, (10, 10), 3)
```
"""
function run_benchmark_default(benchmark, n_points_per_dimension, iterations; kwargs...)
    n_dims = length(n_points_per_dimension)
    longest_side = maximum(n_points_per_dimension)

    # Corners handed to the `FullGridCellList`: zeros and the resolution divided by its
    # largest entry, both as `Float32`.
    lower_corner = 0.0f0 .* n_points_per_dimension
    upper_corner = Float32.(n_points_per_dimension ./ longest_side)

    # The search radius `0.0f0` is presumably a placeholder that `run_benchmark` replaces
    # when it copies the neighborhood searches for each iteration.
    full_grid_cells = FullGridCellList(; search_radius = 0.0f0,
                                       min_corner = lower_corner,
                                       max_corner = upper_corner)

    neighborhood_searches = [
        GridNeighborhoodSearch{n_dims}(),
        GridNeighborhoodSearch{n_dims}(; search_radius = 0.0f0,
                                       cell_list = full_grid_cells),
        PrecomputedNeighborhoodSearch{n_dims}()
    ]

    # One name per neighborhood search above, in the same order, as a `1×3` matrix
    labels = ["GridNeighborhoodSearch" "GridNeighborhoodSearch with FullGridCellList" "PrecomputedNeighborhoodSearch"]

    # `kwargs` come last, so a user-supplied `names` keyword overrides `labels`
    return run_benchmark(benchmark, n_points_per_dimension, iterations,
                         neighborhood_searches; names = labels, kwargs...)
end

"""
    run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...)

Shortcut to call [`run_benchmark`](@ref) with all GPU-compatible neighborhood search
implementations:
- `GridNeighborhoodSearch` with `FullGridCellList`
- `PrecomputedNeighborhoodSearch`

# Arguments
- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
                            of points. For example, use `(100, 100)` for a 2D benchmark or
                            `(10, 10, 10)` for a 3D benchmark.
- `iterations`:             Number of refinement iterations

# Keywords
See [`run_benchmark`](@ref) for a list of available keywords.

# Examples
```julia
include("benchmarks/benchmarks.jl")

run_benchmark_gpu(benchmark_n_body, (10, 10), 3)
```
"""
function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations;
                           parallelization_backend = PolyesterBackend(), kwargs...)
    n_dims = length(n_points_per_dimension)
    longest_side = maximum(n_points_per_dimension)

    # Corners handed to the `FullGridCellList`: zeros and the resolution divided by its
    # largest entry, both as `Float32`.
    lower_corner = 0.0f0 .* n_points_per_dimension
    upper_corner = Float32.(n_points_per_dimension ./ longest_side)

    full_grid_cells = FullGridCellList(; search_radius = 0.0f0,
                                       min_corner = lower_corner,
                                       max_corner = upper_corner)
    grid_search = GridNeighborhoodSearch{n_dims}(; search_radius = 0.0f0,
                                                 cell_list = full_grid_cells,
                                                 update_strategy = ParallelUpdate())

    # The `transpose_backend` flag is enabled only for KernelAbstractions GPU backends
    on_gpu = parallelization_backend isa PointNeighbors.KernelAbstractions.GPU

    # The precomputed search receives the grid search above as
    # `update_neighborhood_search`.
    precomputed_search = PrecomputedNeighborhoodSearch{n_dims}(;
        search_radius = 0.0f0,
        update_neighborhood_search = grid_search,
        transpose_backend = on_gpu)

    neighborhood_searches = [grid_search; precomputed_search]

    # One name per neighborhood search above, in the same order, as a `1×2` matrix
    labels = ["GridNeighborhoodSearch with FullGridCellList" "PrecomputedNeighborhoodSearch"]

    # `kwargs` come last, so a user-supplied `names` keyword overrides `labels`
    return run_benchmark(benchmark, n_points_per_dimension, iterations,
                         neighborhood_searches;
                         names = labels, parallelization_backend, kwargs...)
end

"""
    run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; kwargs...)

Shortcut to call [`run_benchmark`](@ref) with a `GridNeighborhoodSearch` with a
`FullGridCellList`. This is the neighborhood search implementation that is used
in TrixiParticles.jl when performance is important.
Use this function to benchmark and profile TrixiParticles.jl kernels.

# Arguments
- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
                            of points. For example, use `(100, 100)` for a 2D benchmark or
                            `(10, 10, 10)` for a 3D benchmark.
- `iterations`:             Number of refinement iterations

# Keywords
See [`run_benchmark`](@ref) for a list of available keywords.

# Examples
```julia
include("benchmarks/benchmarks.jl")

run_benchmark_full_grid(benchmark_n_body, (10, 10), 3)
```
"""
function run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations;
                                 parallelization_backend = PolyesterBackend(), kwargs...)
    n_dims = length(n_points_per_dimension)
    longest_side = maximum(n_points_per_dimension)

    # Corners handed to the `FullGridCellList`: zeros and the resolution divided by its
    # largest entry, both as `Float32`.
    lower_corner = 0.0f0 .* n_points_per_dimension
    upper_corner = Float32.(n_points_per_dimension ./ longest_side)

    full_grid_cells = FullGridCellList(; search_radius = 0.0f0,
                                       min_corner = lower_corner,
                                       max_corner = upper_corner)
    grid_search = GridNeighborhoodSearch{n_dims}(; search_radius = 0.0f0,
                                                 cell_list = full_grid_cells,
                                                 update_strategy = ParallelUpdate())

    # Single name as a `1×1` matrix, matching the shape used by the other shortcuts
    labels = fill("GridNeighborhoodSearch with FullGridCellList", 1, 1)

    # `kwargs` come last, so a user-supplied `names` keyword overrides `labels`
    return run_benchmark(benchmark, n_points_per_dimension, iterations, [grid_search];
                         names = labels, parallelization_backend, kwargs...)
end

"""
    plot_benchmark(n_particles_vec, times; kwargs...)

Plot the results of a benchmark run with [`run_benchmark`](@ref).
Note that the arguments are the outputs of that function.

# Arguments
- `n_particles_vec`: Vector containing the number of particles for each iteration.
- `times`:           Matrix containing the runtimes for each neighborhood search and iteration.

# Keywords
Keyword arguments are passed to `Plots.plot`. For example, use `title = "My title"`.

# Examples
```julia
include("benchmarks/benchmarks.jl")

n_particles_vec, times = run_benchmark_default(benchmark_count_neighbors, (10, 10), 3)
plot_benchmark(n_particles_vec, times; title = "Count neighbors benchmark")
```
"""
function plot_benchmark(n_particles_vec, times; kwargs...)
    # Abbreviate particle counts for the tick labels: millions as "M", thousands as "k"
    # (both rounded to the nearest integer), smaller counts verbatim.
    tick_label(n) = n >= 1_000_000 ? string(round(Int, n / 1_000_000), "M") :
                    n >= 1_000 ? string(round(Int, n / 1_000), "k") :
                    string(n)

    tick_labels = tick_label.(n_particles_vec)

    # Runtime per particle, scaled by `1e9` to match the "[ns]" axis label.
    # Row `i` of `times` is divided by `n_particles_vec[i]`.
    runtime_per_particle = times ./ n_particles_vec .* 1e9

    # `kwargs` are splatted last, so they take precedence over the defaults set here
    return plot(n_particles_vec, runtime_per_particle;
                xaxis = :log,
                xticks = (n_particles_vec, tick_labels),
                linewidth = 2,
                xlabel = "#particles",
                ylabel = "runtime per particle [ns]",
                legend = :outerright,
                size = (700, 350),
                dpi = 200,
                margin = 4 * Plots.mm,
                palette = palette(:tab10),
                kwargs...)
end