@@ -641,8 +641,8 @@ PhysicalParticleContainer::AddPlasma (int lev, RealBox part_realbox)
641641 overlap_realbox.lo (2 ))};
642642
643643 // count the number of particles that each cell in overlap_box could add
644- Gpu::DeviceVector<int > counts (overlap_box.numPts ()+ 1 , 0 );
645- Gpu::DeviceVector<int > offset (overlap_box.numPts ()+ 1 , 0 );
644+ Gpu::DeviceVector<int > counts (overlap_box.numPts (), 0 );
645+ Gpu::DeviceVector<int > offset (overlap_box.numPts ());
646646 auto pcounts = counts.data ();
647647 int lrrfac = rrfac;
648648 int lrefine_injection = refine_injection;
@@ -674,16 +674,10 @@ PhysicalParticleContainer::AddPlasma (int lev, RealBox part_realbox)
674674 amrex::ignore_unused (k);
675675#endif
676676 });
677- Gpu::exclusive_scan (counts.begin (), counts.end (), offset.begin ());
678677
679678 // Max number of new particles. All of them are created,
680679 // and invalid ones are then discarded
681- int max_new_particles;
682- #ifdef AMREX_USE_GPU
683- Gpu::dtoh_memcpy (&max_new_particles, offset.dataPtr ()+overlap_box.numPts (), sizeof (int ));
684- #else
685- std::memcpy (&max_new_particles, offset.dataPtr ()+overlap_box.numPts (), sizeof (int ));
686- #endif
680+ int max_new_particles = Scan::ExclusiveSum (counts.size (), counts.data (), offset.data ());
687681
688682 // Update NextID to include particles created in this function
689683 Long pid;
@@ -913,13 +907,13 @@ PhysicalParticleContainer::AddPlasma (int lev, RealBox part_realbox)
913907 }
914908 });
915909
910+ amrex::Gpu::synchronize ();
911+
916912 if (cost && WarpX::load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::Timers)
917913 {
918- amrex::Gpu::synchronize ();
919914 wt = amrex::second () - wt;
920915 amrex::HostDevice::Atomic::Add ( &(*cost)[mfi.index ()], wt);
921916 }
922- amrex::Gpu::synchronize ();
923917 }
924918
925919 // The function that calls this is responsible for redistributing particles.
@@ -1149,9 +1143,10 @@ PhysicalParticleContainer::Evolve (int lev,
11491143 }
11501144 }
11511145
1146+ amrex::Gpu::synchronize ();
1147+
11521148 if (cost && WarpX::load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::Timers)
11531149 {
1154- amrex::Gpu::synchronize ();
11551150 wt = amrex::second () - wt;
11561151 amrex::HostDevice::Atomic::Add ( &(*cost)[pti.index ()], wt);
11571152 }
@@ -1255,7 +1250,7 @@ PhysicalParticleContainer::SplitParticles (int lev)
12551250 long np_split;
12561251 if (split_type==0 )
12571252 {
1258- np_split = pow ( 2 , AMREX_SPACEDIM) ;
1253+ np_split = ( AMREX_SPACEDIM == 3 ) ? 8 : 4 ;
12591254 } else {
12601255 np_split = 2 *AMREX_SPACEDIM;
12611256 }
@@ -1599,8 +1594,8 @@ PhysicalParticleContainer::GetParticleSlice (
15991594 // from going out of scope after each iteration, while the kernels
16001595 // may still need access to them.
16011596 // Note that the destructor for WarpXParIter is synchronized.
1602- amrex::Gpu::ManagedDeviceVector <int > FlagForPartCopy;
1603- amrex::Gpu::ManagedDeviceVector <int > IndexForPartCopy;
1597+ amrex::Gpu::DeviceVector <int > FlagForPartCopy;
1598+ amrex::Gpu::DeviceVector <int > IndexForPartCopy;
16041599 for (WarpXParIter pti (*this , lev); pti.isValid (); ++pti)
16051600 {
16061601 const Box& box = pti.validbox ();
@@ -1658,9 +1653,7 @@ PhysicalParticleContainer::GetParticleSlice (
16581653 // exclusive scan to obtain location indices using flag values
16591654 // These location indices are used to copy data from
16601655 // src to dst when the copy-flag is set to 1.
1661- amrex::Gpu::exclusive_scan (Flag,Flag+np,IndexLocation);
1662-
1663- const int total_partdiag_size = IndexLocation[np-1 ] + Flag[np-1 ];
1656+ const int total_partdiag_size = amrex::Scan::ExclusiveSum (np,Flag,IndexLocation);
16641657
16651658 // allocate array size for diagnostic particle array
16661659 diagnostic_particles[lev][index].resize (total_partdiag_size);
@@ -1740,6 +1733,7 @@ PhysicalParticleContainer::GetParticleSlice (
17401733 diag_uzp[loc] = uzp;
17411734 }
17421735 });
1736+ Gpu::synchronize (); // because of FlagForPartCopy & IndexForPartCopy
17431737 }
17441738 }
17451739 }
@@ -1936,10 +1930,10 @@ PhysicalParticleContainer::InitIonizationModule ()
19361930 // Get atomic number and ionization energies from file
19371931 int ion_element_id = ion_map_ids[physical_element];
19381932 ion_atomic_number = ion_atomic_numbers[ion_element_id];
1939- ionization_energies. resize (ion_atomic_number);
1933+ Vector<Real> h_ionization_energies (ion_atomic_number);
19401934 int offset = ion_energy_offsets[ion_element_id];
19411935 for (int i=0 ; i<ion_atomic_number; i++){
1942- ionization_energies [i] = table_ionization_energies[i+offset];
1936+ h_ionization_energies [i] = table_ionization_energies[i+offset];
19431937 }
19441938 // Compute ADK prefactors (See Chen, JCP 236 (2013), equation (2))
19451939 // For now, we assume l=0 and m=0.
@@ -1949,22 +1943,35 @@ PhysicalParticleContainer::InitIonizationModule ()
19491943 Real Ea = PhysConst::m_e * PhysConst::c*PhysConst::c /PhysConst::q_e *
19501944 std::pow (PhysConst::alpha,4 )/PhysConst::r_e;
19511945 Real UH = table_ionization_energies[0 ];
1952- Real l_eff = std::sqrt (UH/ionization_energies [0 ]) - 1 .;
1946+ Real l_eff = std::sqrt (UH/h_ionization_energies [0 ]) - 1 .;
19531947
19541948 const Real dt = WarpX::GetInstance ().getdt (0 );
19551949
1950+ ionization_energies.resize (ion_atomic_number);
19561951 adk_power.resize (ion_atomic_number);
19571952 adk_prefactor.resize (ion_atomic_number);
19581953 adk_exp_prefactor.resize (ion_atomic_number);
1959- for (int i=0 ; i<ion_atomic_number; ++i){
1960- Real n_eff = (i+1 ) * std::sqrt (UH/ionization_energies[i]);
1954+
1955+ Gpu::copyAsync (Gpu::hostToDevice,
1956+ h_ionization_energies.begin (), h_ionization_energies.end (),
1957+ ionization_energies.begin ());
1958+
1959+ Real const * AMREX_RESTRICT p_ionization_energies = ionization_energies.data ();
1960+ Real * AMREX_RESTRICT p_adk_power = adk_power.data ();
1961+ Real * AMREX_RESTRICT p_adk_prefactor = adk_prefactor.data ();
1962+ Real * AMREX_RESTRICT p_adk_exp_prefactor = adk_exp_prefactor.data ();
1963+ amrex::ParallelFor (ion_atomic_number, [=] AMREX_GPU_DEVICE (int i) noexcept
1964+ {
1965+ Real n_eff = (i+1 ) * std::sqrt (UH/p_ionization_energies[i]);
19611966 Real C2 = std::pow (2 ,2 *n_eff)/(n_eff*tgamma (n_eff+l_eff+1 )*tgamma (n_eff-l_eff));
1962- adk_power [i] = -(2 *n_eff - 1 );
1963- Real Uion = ionization_energies [i];
1964- adk_prefactor [i] = dt * wa * C2 * ( Uion/(2 *UH) )
1967+ p_adk_power [i] = -(2 *n_eff - 1 );
1968+ Real Uion = p_ionization_energies [i];
1969+ p_adk_prefactor [i] = dt * wa * C2 * ( Uion/(2 *UH) )
19651970 * std::pow (2 *std::pow ((Uion/UH),3 ./2 )*Ea,2 *n_eff - 1 );
1966- adk_exp_prefactor[i] = -2 ./3 * std::pow ( Uion/UH,3 ./2 ) * Ea;
1967- }
1971+ p_adk_exp_prefactor[i] = -2 ./3 * std::pow ( Uion/UH,3 ./2 ) * Ea;
1972+ });
1973+
1974+ Gpu::synchronize ();
19681975}
19691976
19701977IonizationFilterFunc
0 commit comments