@@ -662,107 +662,113 @@ void UserMeshWorkBeforeOutput(Mesh *pmesh, ParameterInput *pin,
662662 // define the heffte class and the input and output geometry
663663 heffte::fft3d_r2c<backend_tag> fft (inbox, outbox, r2c_direction, comm);
664664
665- // vectors with the correct sizes to store the input and output data
666- // std::vector<Real> input(fft.size_inbox());
665+ // TODO(pgrete) Eventually make these persistent
667666 int n_comp = 3 ;
668- parthenon::HostArray1D<Real> input (" fft input" , n_comp * fft.size_inbox ());
669- parthenon::HostArray1D<Real> inverse (" fft inverse" , n_comp * fft.size_inbox ());
670- parthenon::HostArray1D<std::complex <Real>> output (" fft output" ,
671- n_comp * fft.size_outbox ());
672- parthenon::HostArray1D<std::complex <Real>> workspace (" fft workspace" ,
673- n_comp * fft.size_workspace ());
667+ const auto fft_size_inbox = fft.size_inbox ();
668+ parthenon::ParArray1D<Real> input (" fft input" , n_comp * fft_size_inbox);
669+ parthenon::ParArray1D<Real> inverse (" fft inverse" , n_comp * fft_size_inbox);
670+ parthenon::ParArray1D<std::complex <Real>> output (" fft output" ,
671+ n_comp * fft.size_outbox ());
672+ parthenon::ParArray1D<std::complex <Real>> workspace (" fft workspace" ,
673+ n_comp * fft.size_workspace ());
674674 PARTHENON_REQUIRE_THROWS (pmesh->DefaultNumPartitions () == 1 ,
675675 " Only pack_size=-1 currently supported for heffte." )
676676 auto &md = pmesh->mesh_data .GetOrAdd (" base" , 0 );
677677 IndexRange ib = md->GetBlockData (0 )->GetBoundsI (IndexDomain::interior);
678678 IndexRange jb = md->GetBlockData (0 )->GetBoundsJ (IndexDomain::interior);
679679 IndexRange kb = md->GetBlockData (0 )->GetBoundsK (IndexDomain::interior);
680680 auto prim = md->PackVariables (std::vector<std::string>{" prim" });
681- Real realsum = 0.0 ;
682- for (int b = 0 ; b < pmesh->GetNumMeshBlocksThisRank (); b++) {
683- for (int k = kb.s ; k <= kb.e ; k++)
684- for (int j = jb.s ; j <= jb.e ; j++)
685- for (int i = ib.s ; i <= ib.e ; i++) {
686- const auto &p = prim (b);
687- const auto kk = k - kb.s + loc_view (b, 2 ) * nx3b;
688- const auto jj = j - jb.s + loc_view (b, 1 ) * nx2b;
689- const auto ii = i - ib.s + loc_view (b, 0 ) * nx1b;
690- const std::int64_t idx = (kk * nx2l + jj) * nx1l + ii;
691- input (idx) = p (IV1, k, j, i);
692- input (idx + fft.size_inbox ()) = p (IV2, k, j, i);
693- input (idx + 2 * fft.size_inbox ()) = p (IV3, k, j, i);
694- realsum += SQR (p (IDN, k, j, i) - 1.0 );
695- }
696- }
681+ par_for (
682+ " Init FFT fields" , 0 , pmesh->GetNumMeshBlocksThisRank () - 1 , kb.s , kb.e , jb.s , jb.e ,
683+ ib.s , ib.e , KOKKOS_LAMBDA (const int b, const int k, const int j, const int i) {
684+ const auto &p = prim (b);
685+ const auto kk = k - kb.s + loc_view (b, 2 ) * nx3b;
686+ const auto jj = j - jb.s + loc_view (b, 1 ) * nx2b;
687+ const auto ii = i - ib.s + loc_view (b, 0 ) * nx1b;
688+ const std::int64_t idx = (kk * nx2l + jj) * nx1l + ii;
689+ input (idx) = p (IV1, k, j, i);
690+ input (idx + fft_size_inbox) = p (IV2, k, j, i);
691+ input (idx + 2 * fft_size_inbox) = p (IV3, k, j, i);
692+ });
697693
698694 fft.forward (n_comp, input.data (), output.data (), workspace.data ());
699695
700- auto k_max = std::sqrt (SQR (gnx1 / 2 ) + SQR (gnx2 / 2 ) + SQR (gnx3 / 2 ));
701- Real mysum = 0 ;
702- parthenon::HostArray1D<Real> spec (" spectrum" , static_cast <int >(std::ceil (k_max)) + 1 );
703- for (int k = outbox.low [2 ]; k <= outbox.high [2 ]; k++)
704- for (int j = outbox.low [1 ]; j <= outbox.high [1 ]; j++)
705- for (int i = outbox.low [0 ]; i <= outbox.high [0 ]; i++) {
696+ const auto k_max = std::sqrt (SQR (gnx1 / 2 ) + SQR (gnx2 / 2 ) + SQR (gnx3 / 2 ));
697+
698+ const auto num_bins = static_cast <int >(std::ceil (k_max)) + 1 ;
699+ // TODO(pgrete) if these are being reused, then ensure to reset (i.e., init 0 to and
700+ // call .reset())
701+ ParArray2D<Real> spectra (" spectra" , num_bins, 3 );
702+ // temp view for reduction for better performance (switches
703+ // between atomics and data duplication depending on the platform)
704+ auto scatter_spectra =
705+ Kokkos::Experimental::ScatterView<Real **, parthenon::LayoutWrapper>(
706+ spectra.KokkosView ());
707+
708+ ib.s = outbox.low [0 ];
709+ ib.e = outbox.high [0 ];
710+ jb.s = outbox.low [1 ];
711+ jb.e = outbox.high [1 ];
712+ kb.s = outbox.low [2 ];
713+ kb.e = outbox.high [2 ];
714+ const auto fft_size_outbox = fft.size_outbox ();
715+ parthenon::par_for (
716+ " CalcSpec" , kb.s , kb.e , jb.s , jb.e , ib.s , ib.e ,
717+ KOKKOS_LAMBDA (const int k, const int j, const int i) {
706718 auto k_z = k <= gnx3 / 2 ? k : -gnx3 + k;
707719 auto k_y = j <= gnx2 / 2 ? j : -gnx2 + j;
708720 auto k_x = i; // because we're using r2c transforms
709721
710722 // for simple binning/indexing
711- auto k_mag =
712- static_cast <int >(std::floor (std::sqrt ( SQR (k_x) + SQR (k_y) + SQR (k_z)) ));
723+ auto k_mag = std::sqrt ( SQR (k_x) + SQR (k_y) + SQR (k_z));
724+ auto k_mag_int = static_cast <int >(std::floor (k_mag ));
713725
714- const auto outidx = ((k - outbox.low [2 ]) * outbox.size [1 ] + (j - outbox.low [1 ])) *
715- outbox.size [0 ] +
716- i - outbox.low [0 ];
726+ const auto outidx =
727+ ((k - kb.s ) * (jb.e - jb.s + 1 ) + (j - jb.s )) * (ib.e - ib.s + 1 ) + i - ib.s ;
717728
718729 auto val = SQR (std::abs (output[outidx])) +
719- SQR (std::abs (output[outidx + fft.size_outbox ()])) +
720- SQR (std::abs (output[outidx + 2 * fft.size_outbox ()]));
730+ SQR (std::abs (output[outidx + fft_size_outbox])) +
731+ SQR (std::abs (output[outidx + 2 * fft_size_outbox]));
732+
721733 // account for Hermitian symmetry of r2c transform
722- if ((k_x > 0 ) && (2 * k_x != gnx1)) {
723- val *= 2.0 ;
724- }
734+ const auto fac = ((k_x > 0 ) && (2 * k_x != gnx1)) ? 2.0 : 1.0 ;
725735
726- spec (k_mag) += val;
727- if (k_mag > 0.0 ) {
728- mysum += val;
729- } else {
730- std::cerr << " [" << parthenon::Globals::my_rank
731- << " ] mode 0 is: " << std::sqrt (val) << " \n " ;
732- }
733- }
736+ auto spec = scatter_spectra.access ();
737+ // 0: histsum - 1: ksum - 2: histcount
738+ spec (k_mag_int, 0 ) += fac * val;
739+ spec (k_mag_int, 1 ) += fac * k_mag;
740+ spec (k_mag_int, 2 ) += fac * 1.0 ;
741+ });
742+ Kokkos::Experimental::contribute (spectra.KokkosView (), scatter_spectra);
734743
735- Kokkos::fence ();
736- std::cerr << " [" << parthenon::Globals::my_rank
737- << " ] my sum is: " << mysum / (gnx3 * gnx2 * gnx1) << " and the realsum is "
738- << realsum << " \n " ;
744+ Kokkos::fence (); // May not be required.
739745#ifdef MPI_PARALLEL
740746 // Sum the perturbations over all processors
741747 if (parthenon::Globals::my_rank == 0 ) {
742- PARTHENON_MPI_CHECK (MPI_Reduce (MPI_IN_PLACE, spec .data (), spec .size (),
748+ PARTHENON_MPI_CHECK (MPI_Reduce (MPI_IN_PLACE, spectra .data (), spectra .size (),
743749 MPI_PARTHENON_REAL, MPI_SUM, 0 , MPI_COMM_WORLD));
744750 } else {
745- PARTHENON_MPI_CHECK (MPI_Reduce (spec .data (), spec .data (), spec .size (),
751+ PARTHENON_MPI_CHECK (MPI_Reduce (spectra .data (), spectra .data (), spectra .size (),
746752 MPI_PARTHENON_REAL, MPI_SUM, 0 , MPI_COMM_WORLD));
747753 }
748754#endif // MPI_PARALLEL
749755
750756 if (parthenon::Globals::my_rank == 0 ) {
751-
757+ auto spectra_h = spectra. GetHostMirrorAndCopy ();
752758 // and write data
753759 std::ofstream outfile;
754760 const std::string fname (" spec.csv" );
755761 // On startup, write header
756762 if (tm.ncycle == 0 ) {
757763 outfile.open (fname, std::ofstream::out);
758- outfile << " # cycle, time, pos spec,...\n " ;
764+ outfile << " # cycle, time, num_bins, pos spec,...\n " ;
759765 } else {
760766 outfile.open (fname, std::ofstream::out | std::ofstream::app);
761767 }
762768
763- outfile << tm.ncycle << " ," << tm.time ;
764- for (int i = 0 ; i < spec .size (); i++) {
765- outfile << " ," << spec (i);
769+ outfile << tm.ncycle << " ," << tm.time << " , " << num_bins ;
770+ for (int i = 0 ; i < spectra_h .size (); i++) {
771+ outfile << " ," << spectra_h (i);
766772 }
767773 outfile << std::endl;
768774
0 commit comments