@@ -43,9 +43,9 @@ constexpr auto runUpdate = true; // run update step. Useful to disable for bench
 #endif
 
 #if ANY_CPU_ENABLED
-constexpr auto elementsPerThread = xsimd::batch<float>::size;
 constexpr auto threadsPerBlock = 1;
 constexpr auto sharedElementsPerBlock = 1;
+constexpr auto elementsPerThread = xsimd::batch<float>::size;
 constexpr auto aosoaLanes = elementsPerThread;
 #elif ANY_GPU_ENABLED
 constexpr auto threadsPerBlock = 256;
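Context note: xsimd::batch<float>::size is a compile-time constant giving the lane count of the widest instruction set enabled at build time, which is why the CPU branch derives elementsPerThread from it. A minimal standalone sketch (printed values depend entirely on build flags, e.g. 8 on an AVX2 build, 4 on SSE2):

    #include <cstdio>
    #include <xsimd/xsimd.hpp>

    int main()
    {
        // Lane count of the default batch type; fixed at compile time.
        std::printf("float lanes per batch: %zu\n", xsimd::batch<float>::size);
    }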
@@ -101,9 +101,6 @@ struct llama::SimdTraits<Batch, std::enable_if_t<xsimd::is_batch<Batch>::value>>
     }
 };
 
-template<typename T>
-using MakeBatch = xsimd::batch<T>;
-
 template<typename T, std::size_t N>
 struct MakeSizedBatchImpl
 {
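Note: with the unsized MakeBatch alias gone, only the sized-batch path remains. As a hedged sketch of what MakeSizedBatchImpl plausibly wraps, xsimd offers make_sized_batch_t, which maps (T, N) to a batch with exactly N lanes and to void when none exists; the alias name MakeSizedBatchSketch below is illustrative, not from the commit:

    #include <cstddef>
    #include <type_traits>
    #include <xsimd/xsimd.hpp>

    // Map (T, N) to an xsimd batch with exactly N lanes, if one exists.
    template<typename T, std::size_t N>
    using MakeSizedBatchSketch = xsimd::make_sized_batch_t<T, N>;

    // 4 float lanes exist on any SSE/NEON-class baseline.
    static_assert(!std::is_void_v<MakeSizedBatchSketch<float, 4>>);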
@@ -166,66 +163,60 @@ LLAMA_FN_HOST_ACC_INLINE void pPInteraction(const Acc& acc, ParticleRefI& pis, P
     pis(tag::Vel{}) += dist * sts;
 }
 
-template<int Elems, typename QuotedSMMapping>
+template<int ThreadsPerBlock, int SharedElementsPerBlock, int ElementsPerThread, typename QuotedSMMapping>
 struct UpdateKernel
 {
-    template<typename Acc, typename View>
-    ALPAKA_FN_HOST_ACC void operator()(const Acc& acc, View particles) const
+    ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const
     {
         auto sharedView = [&]
         {
             // if there is only 1 shared element per block, use just a variable (in registers) instead of shared memory
-            if constexpr(sharedElementsPerBlock == 1)
+            if constexpr(SharedElementsPerBlock == 1)
             {
                 using Mapping = llama::mapping::MinAlignedOne<llama::ArrayExtents<int, 1>, SharedMemoryParticle>;
                 return allocViewUninitialized(Mapping{}, llama::bloballoc::Array<Mapping{}.blobSize(0)>{});
             }
             else
             {
-                using ArrayExtents = llama::ArrayExtents<int, sharedElementsPerBlock>;
-                using Mapping = typename QuotedSMMapping::template fn<ArrayExtents, SharedMemoryParticle>;
-                constexpr auto sharedMapping = Mapping{};
-
-                llama::Array<std::byte*, Mapping::blobCount> sharedMems{};
-                boost::mp11::mp_for_each<boost::mp11::mp_iota_c<Mapping::blobCount>>(
-                    [&](auto i)
-                    {
-                        auto& sharedMem = alpaka::declareSharedVar<std::byte[sharedMapping.blobSize(i)], i>(acc);
-                        sharedMems[i] = &sharedMem[0];
-                    });
-                return llama::View{sharedMapping, sharedMems};
+                using Mapping = typename QuotedSMMapping::
+                    template fn<llama::ArrayExtents<int, SharedElementsPerBlock>, SharedMemoryParticle>;
+                return [&]<std::size_t... Is>(std::index_sequence<Is...>)
+                {
+                    return llama::View{
+                        Mapping{},
+                        llama::Array{alpaka::declareSharedVar<std::byte[Mapping{}.blobSize(Is)], Is>(acc)...}};
+                }(std::make_index_sequence<Mapping::blobCount>{});
             }
         }();
 
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
         const auto tbi = alpaka::getIdx<alpaka::Block, alpaka::Threads>(acc)[0];
 
-        auto pis = llama::SimdN<typename View::RecordDim, Elems, MakeSizedBatch>{};
-        llama::loadSimd(particles(ti * Elems), pis);
+        auto pis = llama::SimdN<Particle, ElementsPerThread, MakeSizedBatch>{};
+        llama::loadSimd(particles(ti * ElementsPerThread), pis);
 
-        for(int blockOffset = 0; blockOffset < problemSize; blockOffset += sharedElementsPerBlock)
+        for(int blockOffset = 0; blockOffset < problemSize; blockOffset += SharedElementsPerBlock)
         {
-            for(int j = 0; j < sharedElementsPerBlock; j += threadsPerBlock)
+            for(int j = 0; j < SharedElementsPerBlock; j += ThreadsPerBlock)
                 sharedView(j) = particles(blockOffset + tbi + j);
             alpaka::syncBlockThreads(acc);
-            for(int j = 0; j < sharedElementsPerBlock; ++j)
+            for(int j = 0; j < SharedElementsPerBlock; ++j)
                 pPInteraction(acc, pis, sharedView(j));
             alpaka::syncBlockThreads(acc);
         }
-        llama::storeSimd(pis(tag::Vel{}), particles(ti * Elems)(tag::Vel{}));
+        llama::storeSimd(pis(tag::Vel{}), particles(ti * ElementsPerThread)(tag::Vel{}));
     }
 };
 
-template<int Elems>
+template<int ElementsPerThread>
 struct MoveKernel
 {
-    template<typename Acc, typename View>
-    ALPAKA_FN_HOST_ACC void operator()(const Acc& acc, View particles) const
+    ALPAKA_FN_HOST_ACC void operator()(const auto& acc, auto particles) const
     {
         const auto ti = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc)[0];
-        const auto i = ti * Elems;
-        llama::SimdN<Vec3, Elems, MakeSizedBatch> pos;
-        llama::SimdN<Vec3, Elems, MakeSizedBatch> vel;
+        const auto i = ti * ElementsPerThread;
+        llama::SimdN<Vec3, ElementsPerThread, MakeSizedBatch> pos;
+        llama::SimdN<Vec3, ElementsPerThread, MakeSizedBatch> vel;
         llama::loadSimd(particles(i)(tag::Pos{}), pos);
         llama::loadSimd(particles(i)(tag::Vel{}), vel);
         llama::storeSimd(pos + vel * +timestep, particles(i)(tag::Pos{}));
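Note on the rewritten shared-memory setup above: the boost::mp11::mp_for_each loop is replaced by an immediately invoked C++20 templated lambda that expands alpaka::declareSharedVar over a std::index_sequence, so every blob gets its own statically indexed shared-memory declaration in a single expression. A stripped-down, runnable analogue of the pattern, with a toy declareBlob standing in for alpaka's shared-variable API:

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <utility>

    // Toy stand-in for alpaka::declareSharedVar<std::byte[Size], I>(acc):
    // each compile-time index I yields a distinct static buffer.
    template<std::size_t Size, std::size_t I>
    std::byte* declareBlob()
    {
        static std::byte storage[Size];
        return storage;
    }

    // Expand declareBlob over an index sequence: one declaration per blob,
    // no runtime loop, mirroring the new llama::View construction.
    template<std::size_t BlobCount>
    auto makeBlobArray()
    {
        return []<std::size_t... Is>(std::index_sequence<Is...>)
        {
            return std::array{declareBlob<64, Is>()...};
        }(std::make_index_sequence<BlobCount>{});
    }

    int main()
    {
        const auto blobs = makeBlobArray<3>();
        std::printf("declared %zu distinct blobs\n", blobs.size());
    }

The compile-time index matters because declareSharedVar identifies each shared variable by a unique compile-time template argument; both the old mp_for_each and the new pack expansion exist to supply that ID, the expansion just does it more compactly.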
@@ -354,7 +345,8 @@ void run(std::ostream& plotFile)
     {
         if constexpr(runUpdate)
         {
-            auto updateKernel = UpdateKernel<elementsPerThread, QuotedMappingSM>{};
+            auto updateKernel
+                = UpdateKernel<threadsPerBlock, sharedElementsPerBlock, elementsPerThread, QuotedMappingSM>{};
             alpaka::exec<Acc>(queue, workdiv, updateKernel, llama::shallowCopy(accView));
             statsUpdate(watch.printAndReset("update", '\t'));
         }
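Note: UpdateKernel now receives its whole launch configuration as non-type template parameters instead of reading the file-scope constants, so differently configured instantiations can coexist in one translation unit. A toy analogue of that design (the numeric configurations are illustrative, not from the commit):

    #include <cstdio>

    // Configuration carried in non-type template parameters rather than
    // globals: each instantiation is an independent, self-contained kernel.
    template<int ThreadsPerBlock, int SharedElementsPerBlock, int ElementsPerThread>
    struct ToyUpdateKernel
    {
        void operator()() const
        {
            std::printf("threads/block=%d shared=%d elems/thread=%d\n",
                        ThreadsPerBlock, SharedElementsPerBlock, ElementsPerThread);
        }
    };

    int main()
    {
        ToyUpdateKernel<256, 512, 1>{}(); // GPU-style configuration
        ToyUpdateKernel<1, 1, 8>{}();     // CPU-style SIMD configuration
    }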