Skip to content

Commit 3c189e3

Browse files
committed
Use par_fors in spectra and ffts
1 parent 3e726e6 commit 3c189e3

1 file changed

Lines changed: 65 additions & 59 deletions

File tree

src/pgen/turbulence.cpp

Lines changed: 65 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -662,107 +662,113 @@ void UserMeshWorkBeforeOutput(Mesh *pmesh, ParameterInput *pin,
662662
// define the heffte class and the input and output geometry
663663
heffte::fft3d_r2c<backend_tag> fft(inbox, outbox, r2c_direction, comm);
664664

665-
// vectors with the correct sizes to store the input and output data
666-
// std::vector<Real> input(fft.size_inbox());
665+
// TODO(pgrete) Eventually make these persistent
667666
int n_comp = 3;
668-
parthenon::HostArray1D<Real> input("fft input", n_comp * fft.size_inbox());
669-
parthenon::HostArray1D<Real> inverse("fft inverse", n_comp * fft.size_inbox());
670-
parthenon::HostArray1D<std::complex<Real>> output("fft output",
671-
n_comp * fft.size_outbox());
672-
parthenon::HostArray1D<std::complex<Real>> workspace("fft workspace",
673-
n_comp * fft.size_workspace());
667+
const auto fft_size_inbox = fft.size_inbox();
668+
parthenon::ParArray1D<Real> input("fft input", n_comp * fft_size_inbox);
669+
parthenon::ParArray1D<Real> inverse("fft inverse", n_comp * fft_size_inbox);
670+
parthenon::ParArray1D<std::complex<Real>> output("fft output",
671+
n_comp * fft.size_outbox());
672+
parthenon::ParArray1D<std::complex<Real>> workspace("fft workspace",
673+
n_comp * fft.size_workspace());
674674
PARTHENON_REQUIRE_THROWS(pmesh->DefaultNumPartitions() == 1,
675675
"Only pack_size=-1 currently supported for heffte.")
676676
auto &md = pmesh->mesh_data.GetOrAdd("base", 0);
677677
IndexRange ib = md->GetBlockData(0)->GetBoundsI(IndexDomain::interior);
678678
IndexRange jb = md->GetBlockData(0)->GetBoundsJ(IndexDomain::interior);
679679
IndexRange kb = md->GetBlockData(0)->GetBoundsK(IndexDomain::interior);
680680
auto prim = md->PackVariables(std::vector<std::string>{"prim"});
681-
Real realsum = 0.0;
682-
for (int b = 0; b < pmesh->GetNumMeshBlocksThisRank(); b++) {
683-
for (int k = kb.s; k <= kb.e; k++)
684-
for (int j = jb.s; j <= jb.e; j++)
685-
for (int i = ib.s; i <= ib.e; i++) {
686-
const auto &p = prim(b);
687-
const auto kk = k - kb.s + loc_view(b, 2) * nx3b;
688-
const auto jj = j - jb.s + loc_view(b, 1) * nx2b;
689-
const auto ii = i - ib.s + loc_view(b, 0) * nx1b;
690-
const std::int64_t idx = (kk * nx2l + jj) * nx1l + ii;
691-
input(idx) = p(IV1, k, j, i);
692-
input(idx + fft.size_inbox()) = p(IV2, k, j, i);
693-
input(idx + 2 * fft.size_inbox()) = p(IV3, k, j, i);
694-
realsum += SQR(p(IDN, k, j, i) - 1.0);
695-
}
696-
}
681+
par_for(
682+
"Init FFT fields", 0, pmesh->GetNumMeshBlocksThisRank() - 1, kb.s, kb.e, jb.s, jb.e,
683+
ib.s, ib.e, KOKKOS_LAMBDA(const int b, const int k, const int j, const int i) {
684+
const auto &p = prim(b);
685+
const auto kk = k - kb.s + loc_view(b, 2) * nx3b;
686+
const auto jj = j - jb.s + loc_view(b, 1) * nx2b;
687+
const auto ii = i - ib.s + loc_view(b, 0) * nx1b;
688+
const std::int64_t idx = (kk * nx2l + jj) * nx1l + ii;
689+
input(idx) = p(IV1, k, j, i);
690+
input(idx + fft_size_inbox) = p(IV2, k, j, i);
691+
input(idx + 2 * fft_size_inbox) = p(IV3, k, j, i);
692+
});
697693

698694
fft.forward(n_comp, input.data(), output.data(), workspace.data());
699695

700-
auto k_max = std::sqrt(SQR(gnx1 / 2) + SQR(gnx2 / 2) + SQR(gnx3 / 2));
701-
Real mysum = 0;
702-
parthenon::HostArray1D<Real> spec("spectrum", static_cast<int>(std::ceil(k_max)) + 1);
703-
for (int k = outbox.low[2]; k <= outbox.high[2]; k++)
704-
for (int j = outbox.low[1]; j <= outbox.high[1]; j++)
705-
for (int i = outbox.low[0]; i <= outbox.high[0]; i++) {
696+
const auto k_max = std::sqrt(SQR(gnx1 / 2) + SQR(gnx2 / 2) + SQR(gnx3 / 2));
697+
698+
const auto num_bins = static_cast<int>(std::ceil(k_max)) + 1;
699+
// TODO(pgrete) if these are being reused, then ensure to reset (i.e., init 0 to and
700+
// call .reset())
701+
ParArray2D<Real> spectra("spectra", num_bins, 3);
702+
// temp view for reduction for better performance (switches
703+
// between atomics and data duplication depending on the platform)
704+
auto scatter_spectra =
705+
Kokkos::Experimental::ScatterView<Real **, parthenon::LayoutWrapper>(
706+
spectra.KokkosView());
707+
708+
ib.s = outbox.low[0];
709+
ib.e = outbox.high[0];
710+
jb.s = outbox.low[1];
711+
jb.e = outbox.high[1];
712+
kb.s = outbox.low[2];
713+
kb.e = outbox.high[2];
714+
const auto fft_size_outbox = fft.size_outbox();
715+
parthenon::par_for(
716+
"CalcSpec", kb.s, kb.e, jb.s, jb.e, ib.s, ib.e,
717+
KOKKOS_LAMBDA(const int k, const int j, const int i) {
706718
auto k_z = k <= gnx3 / 2 ? k : -gnx3 + k;
707719
auto k_y = j <= gnx2 / 2 ? j : -gnx2 + j;
708720
auto k_x = i; // because we're using r2c transforms
709721

710722
// for simple binning/indexing
711-
auto k_mag =
712-
static_cast<int>(std::floor(std::sqrt(SQR(k_x) + SQR(k_y) + SQR(k_z))));
723+
auto k_mag = std::sqrt(SQR(k_x) + SQR(k_y) + SQR(k_z));
724+
auto k_mag_int = static_cast<int>(std::floor(k_mag));
713725

714-
const auto outidx = ((k - outbox.low[2]) * outbox.size[1] + (j - outbox.low[1])) *
715-
outbox.size[0] +
716-
i - outbox.low[0];
726+
const auto outidx =
727+
((k - kb.s) * (jb.e - jb.s + 1) + (j - jb.s)) * (ib.e - ib.s + 1) + i - ib.s;
717728

718729
auto val = SQR(std::abs(output[outidx])) +
719-
SQR(std::abs(output[outidx + fft.size_outbox()])) +
720-
SQR(std::abs(output[outidx + 2 * fft.size_outbox()]));
730+
SQR(std::abs(output[outidx + fft_size_outbox])) +
731+
SQR(std::abs(output[outidx + 2 * fft_size_outbox]));
732+
721733
// account for Hermitian symmetry of r2c transform
722-
if ((k_x > 0) && (2 * k_x != gnx1)) {
723-
val *= 2.0;
724-
}
734+
const auto fac = ((k_x > 0) && (2 * k_x != gnx1)) ? 2.0 : 1.0;
725735

726-
spec(k_mag) += val;
727-
if (k_mag > 0.0) {
728-
mysum += val;
729-
} else {
730-
std::cerr << "[" << parthenon::Globals::my_rank
731-
<< "] mode 0 is: " << std::sqrt(val) << "\n";
732-
}
733-
}
736+
auto spec = scatter_spectra.access();
737+
// 0: histsum - 1: ksum - 2: histcount
738+
spec(k_mag_int, 0) += fac * val;
739+
spec(k_mag_int, 1) += fac * k_mag;
740+
spec(k_mag_int, 2) += fac * 1.0;
741+
});
742+
Kokkos::Experimental::contribute(spectra.KokkosView(), scatter_spectra);
734743

735-
Kokkos::fence();
736-
std::cerr << "[" << parthenon::Globals::my_rank
737-
<< "] my sum is: " << mysum / (gnx3 * gnx2 * gnx1) << " and the realsum is "
738-
<< realsum << "\n";
744+
Kokkos::fence(); // May not be required.
739745
#ifdef MPI_PARALLEL
740746
// Sum the perturbations over all processors
741747
if (parthenon::Globals::my_rank == 0) {
742-
PARTHENON_MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, spec.data(), spec.size(),
748+
PARTHENON_MPI_CHECK(MPI_Reduce(MPI_IN_PLACE, spectra.data(), spectra.size(),
743749
MPI_PARTHENON_REAL, MPI_SUM, 0, MPI_COMM_WORLD));
744750
} else {
745-
PARTHENON_MPI_CHECK(MPI_Reduce(spec.data(), spec.data(), spec.size(),
751+
PARTHENON_MPI_CHECK(MPI_Reduce(spectra.data(), spectra.data(), spectra.size(),
746752
MPI_PARTHENON_REAL, MPI_SUM, 0, MPI_COMM_WORLD));
747753
}
748754
#endif // MPI_PARALLEL
749755

750756
if (parthenon::Globals::my_rank == 0) {
751-
757+
auto spectra_h = spectra.GetHostMirrorAndCopy();
752758
// and write data
753759
std::ofstream outfile;
754760
const std::string fname("spec.csv");
755761
// On startup, write header
756762
if (tm.ncycle == 0) {
757763
outfile.open(fname, std::ofstream::out);
758-
outfile << "# cycle, time, pos spec,...\n";
764+
outfile << "# cycle, time, num_bins, pos spec,...\n";
759765
} else {
760766
outfile.open(fname, std::ofstream::out | std::ofstream::app);
761767
}
762768

763-
outfile << tm.ncycle << "," << tm.time;
764-
for (int i = 0; i < spec.size(); i++) {
765-
outfile << "," << spec(i);
769+
outfile << tm.ncycle << "," << tm.time << "," << num_bins;
770+
for (int i = 0; i < spectra_h.size(); i++) {
771+
outfile << "," << spectra_h(i);
766772
}
767773
outfile << std::endl;
768774

0 commit comments

Comments
 (0)