@@ -271,6 +271,8 @@ public:
271271 using namespace ParticleUtils ;
272272 using namespace amrex ::literals;
273273
274+ WARPX_PROFILE (" BinaryCollision::doCollisionsWithinTile" );
275+
274276 const auto & binary_collision_functor = m_binary_collision_functor.executor ();
275277 const bool have_product_species = m_have_product_species;
276278
@@ -320,7 +322,9 @@ public:
320322 ParticleTileType& ptile_1 = species_1.ParticlesAt (lev, mfi);
321323
322324 // Find the particles that are in each cell of this tile
325+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::findParticlesInEachCell" , prof_findParticlesInEachCell);
323326 ParticleBins bins_1 = findParticlesInEachCell ( lev, mfi, ptile_1 );
327+ WARPX_PROFILE_VAR_STOP (prof_findParticlesInEachCell);
324328
325329 // Loop over cells, and collide the particles in each cell
326330
@@ -344,6 +348,7 @@ public:
344348 // Compute how many pairs in each cell and store in n_pairs_in_each_cell array
345349 // For a single species, the number of pairs in a cell is half the number of particles
346350 // in that cell, rounded up to the next higher integer.
351+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::computeNumberOfPairs" , prof_computeNumberOfPairs);
347352 amrex::ParallelFor ( n_cells_products,
348353 [=] AMREX_GPU_DEVICE (int i_cell) noexcept
349354 {
@@ -378,6 +383,7 @@ public:
378383 const auto n_independent_pairs = (int ) amrex::Scan::ExclusiveSum (n_cells+1 ,
379384 p_n_ind_pairs_in_each_cell, coll_offsets.data (), amrex::Scan::RetSum{true });
380385 index_type* AMREX_RESTRICT p_coll_offsets = coll_offsets.dataPtr ();
386+ WARPX_PROFILE_VAR_STOP (prof_computeNumberOfPairs);
381387
382388 // mask: equal to 1 if particle creation occurs for a given pair, 0 otherwise
383389 amrex::Gpu::DeviceVector<index_type> mask (n_total_pairs);
@@ -410,6 +416,7 @@ public:
410416 amrex::ParticleReal* AMREX_RESTRICT T1_in_each_cell = T1_vec.dataPtr ();
411417
412418 // Loop over cells
419+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::computeDensityTemperatures" , prof_computeDensityTemperatures);
413420 amrex::ParallelForRNG ( n_cells,
414421 [=] AMREX_GPU_DEVICE (int i_cell, amrex::RandomEngine const & engine) noexcept
415422 {
@@ -446,6 +453,7 @@ public:
446453 ShuffleFisherYates (indices_1, cell_start_1, cell_stop_1, engine);
447454 }
448455 );
456+ WARPX_PROFILE_VAR_STOP (prof_computeDensityTemperatures);
449457
450458 // Loop over independent particle pairs
451459 // To speed up binary collisions on GPU, we try to expose as much parallelism
@@ -454,6 +462,7 @@ public:
454462 // that do not touch the same macroparticles, so that there is no race condition),
455463 // where the number of independent pairs is determined by the lower number of
456464 // macroparticles of either species, within each cell.
465+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::LoopOverCollisions" , prof_loopOverCollisions);
457466 amrex::ParallelForRNG ( n_independent_pairs,
458467 [=] AMREX_GPU_DEVICE (int i_coll, amrex::RandomEngine const & engine) noexcept
459468 {
@@ -500,7 +509,7 @@ public:
500509 p_pair_reaction_weight, engine);
501510 }
502511 );
503-
512+ WARPX_PROFILE_VAR_STOP (prof_loopOverCollisions);
504513 // Create the new product particles and define their initial values
505514 // num_added: how many particles of each product species have been created
506515 const amrex::Vector<int > num_added = m_copy_transform_functor (n_total_pairs,
@@ -525,8 +534,10 @@ public:
525534 ParticleTileType& ptile_2 = species_2.ParticlesAt (lev, mfi);
526535
527536 // Find the particles that are in each cell of this tile
537+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::findParticlesInEachCell" , prof_findParticlesInEachCell);
528538 ParticleBins bins_1 = findParticlesInEachCell ( lev, mfi, ptile_1 );
529539 ParticleBins bins_2 = findParticlesInEachCell ( lev, mfi, ptile_2 );
540+ WARPX_PROFILE_VAR_STOP (prof_findParticlesInEachCell);
530541
531542 // Loop over cells, and collide the particles in each cell
532543
@@ -557,6 +568,7 @@ public:
557568 // Compute how many pairs in each cell and store in n_pairs_in_each_cell array
558569 // For different species, the number of pairs in a cell is the number of particles of
559570 // the species that has the most particles in that cell
571+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::computeNumberOfPairs" , prof_computeNumberOfPairs);
560572 amrex::ParallelFor ( n_cells_products,
561573 [=] AMREX_GPU_DEVICE (int i_cell) noexcept
562574 {
@@ -604,6 +616,7 @@ public:
604616 const auto n_independent_pairs = (int ) amrex::Scan::ExclusiveSum (n_cells+1 ,
605617 p_n_ind_pairs_in_each_cell, coll_offsets.data (), amrex::Scan::RetSum{true });
606618 index_type* AMREX_RESTRICT p_coll_offsets = coll_offsets.dataPtr ();
619+ WARPX_PROFILE_VAR_STOP (prof_computeNumberOfPairs);
607620
608621 // mask: equal to 1 if particle creation occurs for a given pair, 0 otherwise
609622 amrex::Gpu::DeviceVector<index_type> mask (n_total_pairs);
@@ -640,6 +653,7 @@ public:
640653 amrex::ParticleReal* AMREX_RESTRICT T2_in_each_cell = T2_vec.dataPtr ();
641654
642655 // Loop over cells
656+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::findDensityTemperatures" , prof_findDensityTemperatures);
643657 amrex::ParallelForRNG ( n_cells,
644658 [=] AMREX_GPU_DEVICE (int i_cell, amrex::RandomEngine const & engine) noexcept
645659 {
@@ -697,14 +711,15 @@ public:
697711 ShuffleFisherYates (indices_2, cell_start_2, cell_stop_2, engine);
698712 }
699713 );
700-
714+ WARPX_PROFILE_VAR_STOP (prof_findDensityTemperatures);
701715 // Loop over independent particle pairs
702716 // To speed up binary collisions on GPU, we try to expose as much parallelism
703717 // as possible (while avoiding race conditions): Instead of looping with one GPU
704718 // thread per cell, we loop with one GPU thread per "independent pairs" (i.e. pairs
705719 // that do not touch the same macroparticles, so that there is no race condition),
706720 // where the number of independent pairs is determined by the lower number of
707721 // macroparticles of either species, within each cell.
722+ WARPX_PROFILE_VAR (" BinaryCollision::doCollisionsWithinTile::LoopOverCollisions" , prof_loopOverCollisions);
708723 amrex::ParallelForRNG ( n_independent_pairs,
709724 [=] AMREX_GPU_DEVICE (int i_coll, amrex::RandomEngine const & engine) noexcept
710725 {
@@ -758,6 +773,7 @@ public:
758773 p_pair_reaction_weight, engine);
759774 }
760775 );
776+ WARPX_PROFILE_VAR_STOP (prof_loopOverCollisions);
761777
762778 // Create the new product particles and define their initial values
763779 // num_added: how many particles of each product species have been created
0 commit comments