// Copyright 2022 Bernhard Manfred Gruber
// SPDX-License-Identifier: MPL-2.0
#include "../../common/Stats.hpp"
#include "../../common/Stopwatch.hpp"
#include "../../common/env.hpp"
#include <alpaka/alpaka.hpp>
#include <alpaka/example/ExampleDefaultAcc.hpp>
#include <fmt/format.h>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <llama/llama.hpp>
#include <random>
#include <string>
#include <utility>
#if __has_include(<xsimd/xsimd.hpp>)
# include <xsimd/xsimd.hpp>
# define HAVE_XSIMD
#endif
using FP = float;
constexpr auto problemSize = 64 * 1024; ///< total number of particles
constexpr auto steps = 20; ///< number of steps to calculate, excluding 1 warmup run
constexpr auto allowRsqrt = false; // rsqrt can be much faster, but is less accurate (some compilers insert an
// additional Newton-Raphson refinement step to compensate)
constexpr auto runUpdate = true; // run the update step; useful to disable when benchmarking only the move step
#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_OMP2_T_SEQ_ENABLED) \
|| (defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_CPU))
# define ANY_CPU_ENABLED 1
#endif
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) || defined(ALPAKA_ACC_GPU_HIP_ENABLED) \
|| (defined(ALPAKA_ACC_SYCL_ENABLED) && defined(ALPAKA_SYCL_ONEAPI_GPU))
# define ANY_GPU_ENABLED 1
#endif
#if ANY_CPU_ENABLED && ANY_GPU_ENABLED
# error Cannot enable CPU and GPU backends at the same time
#endif
#if ANY_CPU_ENABLED
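// On CPU backends each alpaka block is a single thread; parallelism within a thread comes from xsimd,
// so every thread processes one SIMD batch of particles and no shared-memory staging is used.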
constexpr auto threadsPerBlock = 1;
constexpr auto sharedElementsPerBlock = 1;
constexpr auto elementsPerThread = xsimd::batch<float>::size;
constexpr auto aosoaLanes = elementsPerThread;
#elif ANY_GPU_ENABLED
constexpr auto threadsPerBlock = 256;
constexpr auto sharedElementsPerBlock = 512;
constexpr auto elementsPerThread = 1;
constexpr auto aosoaLanes = 32; // coalesced memory access
#else
# error "Unsupported backend"
#endif
// keep the tiling logic simple for now: problem and block sizes must divide evenly
static_assert(problemSize % sharedElementsPerBlock == 0);
static_assert(sharedElementsPerBlock % threadsPerBlock == 0);
constexpr auto timestep = FP{0.0001}; ///< integration time step
constexpr auto eps2 = FP{0.01}; ///< softening factor, avoids singular forces when particles get very close
constexpr auto rngSeed = 42; ///< seed for the particle initialization
constexpr auto referenceParticleIndex = 1338; ///< particle whose final position is printed for verification
#ifdef HAVE_XSIMD
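// Teach LLAMA to use xsimd batches as SIMD registers by specializing llama::SimdTraits:
// only (un)aligned load/store and gather/scatter are needed to vectorize record access.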
template<typename Batch>
struct llama::SimdTraits<Batch, std::enable_if_t<xsimd::is_batch<Batch>::value>>
{
using value_type = typename Batch::value_type;
inline static constexpr std::size_t lanes = Batch::size;
static LLAMA_FORCE_INLINE auto loadUnaligned(const value_type* mem) -> Batch
{
return Batch::load_unaligned(mem);
}
static LLAMA_FORCE_INLINE void storeUnaligned(Batch batch, value_type* mem)
{
batch.store_unaligned(mem);
}
template<std::size_t... Is>
static LLAMA_FORCE_INLINE auto indicesToReg(std::array<int, lanes> indices, std::index_sequence<Is...>)
{
return xsimd::batch<int, typename Batch::arch_type>(indices[Is]...);
}
static LLAMA_FORCE_INLINE auto gather(const value_type* mem, std::array<int, lanes> indices) -> Batch
{
return Batch::gather(mem, indicesToReg(indices, std::make_index_sequence<lanes>{}));
}
static LLAMA_FORCE_INLINE void scatter(Batch batch, value_type* mem, std::array<int, lanes> indices)
{
batch.scatter(mem, indicesToReg(indices, std::make_index_sequence<lanes>{}));
}
};
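// Maps an element type and lane count to a fixed-width xsimd batch; used below as the SIMD register
// type for llama::SimdN. The static_assert fires if xsimd provides no batch of the requested width.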
template<typename T, std::size_t N>
struct MakeSizedBatchImpl
{
using type = xsimd::make_sized_batch_t<T, N>;
static_assert(!std::is_void_v<type>);
};
template<typename T, std::size_t N>
using MakeSizedBatch = typename MakeSizedBatchImpl<T, N>::type;
#endif
// clang-format off
namespace tag
{
struct Pos{};
struct Vel{};
struct X{};
struct Y{};
struct Z{};
struct Mass{};
struct Padding{};
} // namespace tag
using Vec3 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>>;
using Particle = llama::Record<
llama::Field<tag::Pos, Vec3>,
llama::Field<tag::Vel, Vec3>,
llama::Field<tag::Mass, FP>>;
using ParticleJ = llama::Record<
llama::Field<tag::Pos, Vec3>,
llama::Field<tag::Mass, FP>>;
// clang-format on
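// Particles staged in shared memory only need position and mass, because pPInteraction reads nothing
// else from the j-particle; switch to Particle to stage the full record instead.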
// using SharedMemoryParticle = Particle;
using SharedMemoryParticle = ParticleJ;
enum Mapping
{
AoS,
SoA_SB,
SoA_MB,
AoSoA,
SplitGpuGems
};
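// Particle-particle interaction: accumulates the velocity change of particle(s) i due to particle j
// using the softened inverse-cube law, v_i += dt * m_j * (p_i - p_j) / (|p_i - p_j|^2 + eps2)^(3/2).
// pis may be a SIMD record covering several i-particles at once; pj is a single (scalar) particle.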
template<typename Acc, typename ParticleRefI, typename ParticleRefJ>
LLAMA_FN_HOST_ACC_INLINE void pPInteraction(const Acc& acc, ParticleRefI& pis, ParticleRefJ pj)
{
auto dist = pis(tag::Pos{}) - pj(tag::Pos{});
dist *= dist;
const auto distSqr = +eps2 + dist(tag::X{}) + dist(tag::Y{}) + dist(tag::Z{});
const auto distSixth = distSqr * distSqr * distSqr;
const auto invDistCube
= allowRsqrt ? alpaka::math::rsqrt(acc, distSixth) : (FP{1} / alpaka::math::sqrt(acc, distSixth));
const auto sts = (pj(tag::Mass{}) * timestep) * invDistCube;
pis(tag::Vel{}) += dist * sts;
}
template<int ThreadsPerBlock, int SharedElementsPerBlock, int ElementsPerThread, typename QuotedSMMapping>
struct UpdateKernel
{
ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const
{
auto sharedView = [&]
{
// if there is only 1 shared element per block, use just a variable (in registers) instead of shared memory
if constexpr(SharedElementsPerBlock == 1)
{
using Mapping = llama::mapping::MinAlignedOne<llama::ArrayExtents<int, 1>, SharedMemoryParticle>;
return allocViewUninitialized(Mapping{}, llama::bloballoc::Array<Mapping{}.blobSize(0)>{});
}
else
{
using Mapping = typename QuotedSMMapping::
template fn<llama::ArrayExtents<int, SharedElementsPerBlock>, SharedMemoryParticle>;
return [&]<std::size_t... Is>(std::index_sequence<Is...>)
{
return llama::View{
Mapping{},
llama::Array{alpaka::declareSharedVar<std::byte[Mapping{}.blobSize(Is)], Is>(acc)...}};
}(std::make_index_sequence<Mapping::blobCount>{});
}
}();
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
const auto tbi = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
auto pis = llama::SimdN<Particle, ElementsPerThread, MakeSizedBatch>{};
llama::loadSimd(particles(ti * ElementsPerThread), pis);
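// Tile over all particles: each block cooperatively stages SharedElementsPerBlock j-particles into
// the shared view (each thread copies a strided subset), synchronizes, and then every thread
// accumulates the interactions of its SIMD lanes of i-particles with all staged j-particles.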
for(int blockOffset = 0; blockOffset < problemSize; blockOffset += SharedElementsPerBlock)
{
for(int j = 0; j < SharedElementsPerBlock; j += ThreadsPerBlock)
sharedView(tbi + j) = particles(blockOffset + tbi + j);
alpaka::syncBlockThreads(acc);
for(int j = 0; j < SharedElementsPerBlock; ++j)
pPInteraction(acc, pis, sharedView(j));
alpaka::syncBlockThreads(acc);
}
llama::storeSimd(pis(tag::Vel{}), particles(ti * ElementsPerThread)(tag::Vel{}));
}
};
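// Integrates the particle positions with one explicit Euler step, pos += vel * timestep,
// processing ElementsPerThread particles per thread via SIMD loads and stores.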
template<int ElementsPerThread>
struct MoveKernel
{
ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const
{
const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
const auto i = ti * ElementsPerThread;
llama::SimdN<Vec3, ElementsPerThread, MakeSizedBatch> pos;
llama::SimdN<Vec3, ElementsPerThread, MakeSizedBatch> vel;
llama::loadSimd(particles(i)(tag::Pos{}), pos);
llama::loadSimd(particles(i)(tag::Vel{}), vel);
llama::storeSimd(pos + vel * +timestep, particles(i)(tag::Pos{}));
}
};
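// Backends with dedicated on-chip shared memory (CUDA, HIP, Intel SYCL GPUs); main() only benchmarks
// the shared-memory mapping variations on these accelerators.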
template<typename Acc>
constexpr auto hasSharedMem
= alpaka::accMatchesTags<Acc, alpaka::TagGpuCudaRt, alpaka::TagGpuHipRt, alpaka::TagGpuSyclIntel>;
template<typename Acc, Mapping MappingGM, Mapping MappingSM>
void run(std::ostream& plotFile)
{
using Dim = alpaka::Dim<Acc>;
using Size = alpaka::Idx<Acc>;
auto mappingName = [](int m) -> std::string
{
if(m == 0)
return "AoS";
if(m == 1)
return "SoA SB";
if(m == 2)
return "SoA MB";
if(m == 3)
return "AoSoA" + std::to_string(aosoaLanes);
if(m == 4)
return "SplitGpuGems";
std::abort();
};
const auto title = "GM " + mappingName(MappingGM) + (hasSharedMem<Acc> ? " SM " + mappingName(MappingSM) : "");
std::cout << '\n' << title << '\n';
const auto platformAcc = alpaka::Platform<Acc>{};
const auto platformHost = alpaka::PlatformCpu{};
const auto devAcc = alpaka::getDevByIdx(platformAcc, 0);
const auto devHost = alpaka::getDevByIdx(platformHost, 0);
auto queue = alpaka::Queue<Acc, alpaka::Blocking>{devAcc};
auto mapping = []
{
using ArrayExtents = llama::ArrayExtentsDynamic<int, 1>;
const auto extents = ArrayExtents{problemSize};
if constexpr(MappingGM == AoS)
return llama::mapping::AoS<ArrayExtents, Particle>{extents};
if constexpr(MappingGM == SoA_SB)
return llama::mapping::SoA<ArrayExtents, Particle, llama::mapping::Blobs::Single>{extents};
if constexpr(MappingGM == SoA_MB)
return llama::mapping::SoA<ArrayExtents, Particle, llama::mapping::Blobs::OnePerField>{extents};
if constexpr(MappingGM == AoSoA)
return llama::mapping::AoSoA<ArrayExtents, Particle, aosoaLanes>{extents};
using boost::mp11::mp_list;
if constexpr(MappingGM == SplitGpuGems)
{
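// Layout following the GPU Gems n-body example: x, y, z and mass form one tightly packed AoS blob
// (four floats per particle), while the velocity, padded to four floats, forms a second AoS blob.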
using Vec4 = llama::Record<
llama::Field<tag::X, FP>,
llama::Field<tag::Y, FP>,
llama::Field<tag::Z, FP>,
llama::Field<tag::Padding, FP>>; // dummy
using ParticlePadded = llama::
Record<llama::Field<tag::Pos, Vec3>, llama::Field<tag::Vel, Vec4>, llama::Field<tag::Mass, FP>>;
return llama::mapping::Split<
ArrayExtents,
ParticlePadded,
mp_list<
mp_list<tag::Pos, tag::X>,
mp_list<tag::Pos, tag::Y>,
mp_list<tag::Pos, tag::Z>,
mp_list<tag::Mass>>,
llama::mapping::BindAoS<>::fn,
llama::mapping::BindAoS<>::fn,
true>{extents};
}
}();
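// The shared-memory mapping is selected as a quoted meta function (Bind*), because UpdateKernel
// instantiates it later with the shared tile's extents and the SharedMemoryParticle record.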
[[maybe_unused]] auto selectedSMMapping = []
{
if constexpr(MappingSM == AoS)
return llama::mapping::BindAoS{};
if constexpr(MappingSM == SoA_SB)
return llama::mapping::BindSoA<llama::mapping::Blobs::Single>{};
if constexpr(MappingSM == SoA_MB)
return llama::mapping::BindSoA<llama::mapping::Blobs::OnePerField>{};
if constexpr(MappingSM == AoSoA)
return llama::mapping::BindAoSoA<aosoaLanes>{};
}();
using QuotedMappingSM = decltype(selectedSMMapping);
std::ofstream{"nbody_alpaka_mapping_" + mappingName(MappingGM) + ".svg"}
<< llama::toSvg(decltype(mapping){llama::ArrayExtentsDynamic<int, 1>{32}});
Stopwatch watch;
auto hostView
= llama::allocViewUninitialized(mapping, llama::bloballoc::AlpakaBuf<Size, decltype(devHost)>{devHost});
auto accView = llama::allocViewUninitialized(mapping, llama::bloballoc::AlpakaBuf<Size, decltype(devAcc)>{devAcc});
watch.printAndReset("alloc views");
std::mt19937_64 engine{rngSeed};
std::normal_distribution distribution(FP{0}, FP{1});
for(int i = 0; i < problemSize; ++i)
{
llama::One<Particle> p;
p(tag::Pos(), tag::X()) = distribution(engine);
p(tag::Pos(), tag::Y()) = distribution(engine);
p(tag::Pos(), tag::Z()) = distribution(engine);
p(tag::Vel(), tag::X()) = distribution(engine) / FP{10};
p(tag::Vel(), tag::Y()) = distribution(engine) / FP{10};
p(tag::Vel(), tag::Z()) = distribution(engine) / FP{10};
p(tag::Mass()) = distribution(engine) / FP{100};
hostView(i) = p;
}
watch.printAndReset("init");
for(std::size_t i = 0; i < mapping.blobCount; i++)
alpaka::memcpy(queue, accView.blobs()[i], hostView.blobs()[i]);
watch.printAndReset("copy H->D");
const auto workdiv = alpaka::WorkDivMembers<Dim, Size>{
alpaka::Vec<Dim, Size>{static_cast<Size>(problemSize / (threadsPerBlock * elementsPerThread))},
alpaka::Vec<Dim, Size>{static_cast<Size>(threadsPerBlock)},
alpaka::Vec<Dim, Size>{static_cast<Size>(elementsPerThread)}};
common::Stats statsUpdate;
common::Stats statsMove;
for(int s = 0; s < steps + 1; ++s)
{
if constexpr(runUpdate)
{
auto updateKernel
= UpdateKernel<threadsPerBlock, sharedElementsPerBlock, elementsPerThread, QuotedMappingSM>{};
alpaka::exec<Acc>(queue, workdiv, updateKernel, llama::shallowCopy(accView));
statsUpdate(watch.printAndReset("update", '\t'));
}
auto moveKernel = MoveKernel<elementsPerThread>{};
alpaka::exec<Acc>(queue, workdiv, moveKernel, llama::shallowCopy(accView));
statsMove(watch.printAndReset("move"));
}
plotFile << std::quoted(title) << "\t" << statsUpdate.mean() << "\t" << statsUpdate.sem() << '\t'
<< statsMove.mean() << "\t" << statsMove.sem() << '\n';
for(std::size_t i = 0; i < mapping.blobCount; i++)
alpaka::memcpy(queue, hostView.blobs()[i], accView.blobs()[i]);
watch.printAndReset("copy D->H");
const auto [x, y, z] = hostView(referenceParticleIndex)(tag::Pos{});
fmt::print("reference pos: {{{} {} {}}}\n", x, y, z);
}
auto main() -> int
try
{
using Dim = alpaka::DimInt<1>;
using Size = int;
using Acc = alpaka::ExampleDefaultAcc<Dim, Size>;
const auto env = common::captureEnv<Acc>();
std::cout << problemSize / 1000 << "k particles (" << problemSize * llama::sizeOf<Particle> / 1024 << "kiB)\n"
<< "Caching " << threadsPerBlock << " particles (" << threadsPerBlock * llama::sizeOf<Particle> / 1024
<< " kiB) in shared memory\n"
<< "Reducing on " << elementsPerThread << " particles per thread\n"
<< "Using " << threadsPerBlock << " threads per block\n"
<< env << '\n';
std::cout << std::fixed;
std::ofstream plotFile{"nbody_alpaka.sh"};
plotFile.exceptions(std::ios::badbit | std::ios::failbit);
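// The results are written as a self-contained gnuplot script: a header configuring the histogram
// plot, followed by one data row per mapping combination (mean and standard error of the update
// and move kernels), appended by run().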
plotFile << fmt::format(
R"(#!/usr/bin/gnuplot -p
# {}
set title "nbody alpaka {}ki particles on {}"
set style data histograms
set style histogram errorbars
set style fill solid border -1
set xtics rotate by 45 right nomirror
set key out top center maxrows 3
set yrange [0:*]
set y2range [0:*]
set ylabel "update runtime [s]"
set y2label "move runtime [s]"
set y2tics auto
$data << EOD
"" "update" "update_sem" "move" "move_sem"
)",
env,
problemSize / 1024,
alpaka::getAccName<Acc>());
constexpr auto runSMVariations = hasSharedMem<Acc> && runUpdate;
run<Acc, AoS, AoS>(plotFile);
if constexpr(runSMVariations)
run<Acc, AoS, SoA_SB>(plotFile);
if constexpr(runSMVariations)
run<Acc, AoS, AoSoA>(plotFile);
run<Acc, SoA_MB, AoS>(plotFile);
if constexpr(runSMVariations)
run<Acc, SoA_MB, SoA_SB>(plotFile);
if constexpr(runSMVariations)
run<Acc, SoA_MB, AoSoA>(plotFile);
run<Acc, AoSoA, AoS>(plotFile);
if constexpr(runSMVariations)
run<Acc, AoSoA, SoA_SB>(plotFile);
if constexpr(runSMVariations)
run<Acc, AoSoA, AoSoA>(plotFile);
run<Acc, SplitGpuGems, AoS>(plotFile);
plotFile << R"(EOD
plot $data using 2:3:xtic(1) ti col axis x1y1, "" using 4:5 ti col axis x1y2
)";
std::cout << "Plot with: ./nbody_alpaka.sh\n";
return 0;
}
catch(const std::exception& e)
{
std::cerr << "Exception: " << e.what() << '\n';
}