Skip to content

Commit 65db421

Browse files
authored
Add more profiling instrumentation in the collision kernel (BLAST-WarpX#5766)
1 parent a28b20a commit 65db421

File tree

1 file changed

+18
-2
lines changed

1 file changed

+18
-2
lines changed

Source/Particles/Collision/BinaryCollision/BinaryCollision.H

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,8 @@ public:
271271
using namespace ParticleUtils;
272272
using namespace amrex::literals;
273273

274+
WARPX_PROFILE("BinaryCollision::doCollisionsWithinTile");
275+
274276
const auto& binary_collision_functor = m_binary_collision_functor.executor();
275277
const bool have_product_species = m_have_product_species;
276278

@@ -320,7 +322,9 @@ public:
320322
ParticleTileType& ptile_1 = species_1.ParticlesAt(lev, mfi);
321323

322324
// Find the particles that are in each cell of this tile
325+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::findParticlesInEachCell", prof_findParticlesInEachCell);
323326
ParticleBins bins_1 = findParticlesInEachCell( lev, mfi, ptile_1 );
327+
WARPX_PROFILE_VAR_STOP(prof_findParticlesInEachCell);
324328

325329
// Loop over cells, and collide the particles in each cell
326330

@@ -344,6 +348,7 @@ public:
344348
// Compute how many pairs are in each cell and store the result in the n_pairs_in_each_cell array
345349
// For a single species, the number of pairs in a cell is half the number of particles
346350
// in that cell, rounded up to the next higher integer.
351+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::computeNumberOfPairs", prof_computeNumberOfPairs);
347352
amrex::ParallelFor( n_cells_products,
348353
[=] AMREX_GPU_DEVICE (int i_cell) noexcept
349354
{
@@ -378,6 +383,7 @@ public:
378383
const auto n_independent_pairs = (int) amrex::Scan::ExclusiveSum(n_cells+1,
379384
p_n_ind_pairs_in_each_cell, coll_offsets.data(), amrex::Scan::RetSum{true});
380385
index_type* AMREX_RESTRICT p_coll_offsets = coll_offsets.dataPtr();
386+
WARPX_PROFILE_VAR_STOP(prof_computeNumberOfPairs);
381387

382388
// mask: equal to 1 if particle creation occurs for a given pair, 0 otherwise
383389
amrex::Gpu::DeviceVector<index_type> mask(n_total_pairs);
@@ -410,6 +416,7 @@ public:
410416
amrex::ParticleReal* AMREX_RESTRICT T1_in_each_cell = T1_vec.dataPtr();
411417

412418
// Loop over cells
419+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::computeDensityTemperatures", prof_computeDensityTemperatures);
413420
amrex::ParallelForRNG( n_cells,
414421
[=] AMREX_GPU_DEVICE (int i_cell, amrex::RandomEngine const& engine) noexcept
415422
{
@@ -446,6 +453,7 @@ public:
446453
ShuffleFisherYates(indices_1, cell_start_1, cell_stop_1, engine);
447454
}
448455
);
456+
WARPX_PROFILE_VAR_STOP(prof_computeDensityTemperatures);
449457

450458
// Loop over independent particle pairs
451459
// To speed up binary collisions on GPU, we try to expose as much parallelism
@@ -454,6 +462,7 @@ public:
454462
// that do not touch the same macroparticles, so that there is no race condition),
455463
// where the number of independent pairs is determined by the lower number of
456464
// macroparticles of either species, within each cell.
465+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::LoopOverCollisions", prof_loopOverCollisions);
457466
amrex::ParallelForRNG( n_independent_pairs,
458467
[=] AMREX_GPU_DEVICE (int i_coll, amrex::RandomEngine const& engine) noexcept
459468
{
@@ -500,7 +509,7 @@ public:
500509
p_pair_reaction_weight, engine);
501510
}
502511
);
503-
512+
WARPX_PROFILE_VAR_STOP(prof_loopOverCollisions);
504513
// Create the new product particles and define their initial values
505514
// num_added: how many particles of each product species have been created
506515
const amrex::Vector<int> num_added = m_copy_transform_functor(n_total_pairs,
@@ -525,8 +534,10 @@ public:
525534
ParticleTileType& ptile_2 = species_2.ParticlesAt(lev, mfi);
526535

527536
// Find the particles that are in each cell of this tile
537+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::findParticlesInEachCell", prof_findParticlesInEachCell);
528538
ParticleBins bins_1 = findParticlesInEachCell( lev, mfi, ptile_1 );
529539
ParticleBins bins_2 = findParticlesInEachCell( lev, mfi, ptile_2 );
540+
WARPX_PROFILE_VAR_STOP(prof_findParticlesInEachCell);
530541

531542
// Loop over cells, and collide the particles in each cell
532543

@@ -557,6 +568,7 @@ public:
557568
// Compute how many pairs are in each cell and store the result in the n_pairs_in_each_cell array
558569
// For different species, the number of pairs in a cell is the number of particles of
559570
// the species that has the most particles in that cell
571+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::computeNumberOfPairs", prof_computeNumberOfPairs);
560572
amrex::ParallelFor( n_cells_products,
561573
[=] AMREX_GPU_DEVICE (int i_cell) noexcept
562574
{
@@ -604,6 +616,7 @@ public:
604616
const auto n_independent_pairs = (int) amrex::Scan::ExclusiveSum(n_cells+1,
605617
p_n_ind_pairs_in_each_cell, coll_offsets.data(), amrex::Scan::RetSum{true});
606618
index_type* AMREX_RESTRICT p_coll_offsets = coll_offsets.dataPtr();
619+
WARPX_PROFILE_VAR_STOP(prof_computeNumberOfPairs);
607620

608621
// mask: equal to 1 if particle creation occurs for a given pair, 0 otherwise
609622
amrex::Gpu::DeviceVector<index_type> mask(n_total_pairs);
@@ -640,6 +653,7 @@ public:
640653
amrex::ParticleReal* AMREX_RESTRICT T2_in_each_cell = T2_vec.dataPtr();
641654

642655
// Loop over cells
656+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::findDensityTemperatures", prof_findDensityTemperatures);
643657
amrex::ParallelForRNG( n_cells,
644658
[=] AMREX_GPU_DEVICE (int i_cell, amrex::RandomEngine const& engine) noexcept
645659
{
@@ -697,14 +711,15 @@ public:
697711
ShuffleFisherYates(indices_2, cell_start_2, cell_stop_2, engine);
698712
}
699713
);
700-
714+
WARPX_PROFILE_VAR_STOP(prof_findDensityTemperatures);
701715
// Loop over independent particle pairs
702716
// To speed up binary collisions on GPU, we try to expose as much parallelism
703717
// as possible (while avoiding race conditions): Instead of looping with one GPU
704718
// thread per cell, we loop with one GPU thread per "independent pair" (i.e. pairs
705719
// that do not touch the same macroparticles, so that there is no race condition),
706720
// where the number of independent pairs is determined by the lower number of
707721
// macroparticles of either species, within each cell.
722+
WARPX_PROFILE_VAR("BinaryCollision::doCollisionsWithinTile::LoopOverCollisions", prof_loopOverCollisions);
708723
amrex::ParallelForRNG( n_independent_pairs,
709724
[=] AMREX_GPU_DEVICE (int i_coll, amrex::RandomEngine const& engine) noexcept
710725
{
@@ -758,6 +773,7 @@ public:
758773
p_pair_reaction_weight, engine);
759774
}
760775
);
776+
WARPX_PROFILE_VAR_STOP(prof_loopOverCollisions);
761777

762778
// Create the new product particles and define their initial values
763779
// num_added: how many particles of each product species have been created

0 commit comments

Comments
 (0)