@@ -226,25 +226,36 @@ namespace amrex
 
     //! example: auto mf = amrex::cast<MultiFab>(imf);
     template <typename T, typename U>
-    T cast (U const& mf_in)
-    {
-        T mf_out(mf_in.boxArray(), mf_in.DistributionMap(), mf_in.nComp(), mf_in.nGrowVect());
+    T cast (U const& mf_in);
 
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (Gpu::notInLaunchRegion())
-#endif
-        for (MFIter mfi(mf_in); mfi.isValid(); ++mfi)
-        {
-            const Long n = mfi.fabbox().numPts() * mf_in.nComp();
-            auto* pdst = mf_out[mfi].dataPtr();
-            auto const* psrc = mf_in [mfi].dataPtr();
-            AMREX_HOST_DEVICE_PARALLEL_FOR_1D (n, i,
-            {
-                pdst[i] = static_cast<typename T::value_type>(psrc[i]); // NOLINT(bugprone-signed-char-misuse)
-            });
-        }
-        return mf_out;
-    }
+    /**
+     * \brief Returns part of a norm based on two MultiFabs.
+     *
+     * The MultiFabs MUST have the same underlying BoxArray.
+     * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
+     * inside the summation.
+     */
+    template <typename F>
+    Real NormHelper (const MultiFab& x, int xcomp,
+                     const MultiFab& y, int ycomp,
+                     F && f,
+                     int numcomp, IntVect nghost, bool local);
+
+    /**
+     * \brief Returns part of a norm based on three MultiFabs.
+     *
+     * The MultiFabs MUST have the same underlying BoxArray.
+     * The predicate pf is used to test the mask.
+     * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
+     * inside the summation, for cells where pf(mask(i,j,k)) is true.
+     */
+    template <typename MMF, typename Pred, typename F>
+    Real NormHelper (const MMF& mask,
+                     const MultiFab& x, int xcomp,
+                     const MultiFab& y, int ycomp,
+                     Pred && pf,
+                     F && f,
+                     int numcomp, IntVect nghost, bool local);
 
     /**
      * \brief Reduce FabArray/MultiFab data to a plane.
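For orientation, a minimal usage sketch of the two-MultiFab `NormHelper` overload declared above: a global dot product over one component. The wrapper name `dot_product`, the lambda, and the argument choices are illustrative assumptions, not part of this change.

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Dot product over component 0 of x and y, valid cells only.
amrex::Real dot_product (amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    return amrex::NormHelper(
        x, 0, y, 0,
        // f(a,b) = a*b; must be host/device callable so the GPU path can use it
        [=] AMREX_GPU_HOST_DEVICE (amrex::Real a, amrex::Real b) -> amrex::Real {
            return a*b;
        },
        1,                  // numcomp: a single component
        amrex::IntVect(0),  // nghost: no ghost cells in the sum
        false);             // local = false: reduce partial sums across MPI ranks
}
```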
@@ -621,140 +632,6 @@ void average_down (const FabArray<FAB>& S_fine, FabArray<FAB>& S_crse,
     }
 }
 
-
-
-
-
-/**
- * \brief Returns part of a norm based on two MultiFabs
- * The MultiFabs MUST have the same underlying BoxArray.
- * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
- * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n)
- */
-
-template <typename F>
-Real
-NormHelper (const MultiFab& x, int xcomp,
-            const MultiFab& y, int ycomp,
-            F && f,
-            int numcomp, IntVect nghost, bool local)
-{
-    BL_ASSERT(x.boxArray() == y.boxArray());
-    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
-    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
-
-    Real sm = Real(0.0);
-#ifdef AMREX_USE_GPU
-    if (Gpu::inLaunchRegion()) {
-        auto const& xma = x.const_arrays();
-        auto const& yma = y.const_arrays();
-        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
-        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
-        {
-            Real t = Real(0.0);
-            auto const& xfab = xma[box_no];
-            auto const& yfab = yma[box_no];
-            for (int n = 0; n < numcomp; ++n) {
-                t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-            }
-            return t;
-        });
-    } else
-#endif
-    {
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
-#endif
-        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
-        {
-            Box const& bx = mfi.growntilebox(nghost);
-            Array4<Real const> const& xfab = x.const_array(mfi);
-            Array4<Real const> const& yfab = y.const_array(mfi);
-            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
-            {
-                sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-            });
-        }
-    }
-
-    if (!local) {
-        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
-    }
-
-    return sm;
-}
-
-/**
- * \brief Returns part of a norm based on three MultiFabs
- * The MultiFabs MUST have the same underlying BoxArray.
- * The Predicate pf is used to test the mask
- * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
- * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n)
- */
-
-template <typename MMF, typename Pred, typename F>
-Real
-NormHelper (const MMF& mask,
-            const MultiFab& x, int xcomp,
-            const MultiFab& y, int ycomp,
-            Pred && pf,
-            F && f,
-            int numcomp, IntVect nghost, bool local)
-{
-    BL_ASSERT(x.boxArray() == y.boxArray());
-    BL_ASSERT(x.boxArray() == mask.boxArray());
-    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
-    BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
-    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
-    BL_ASSERT(mask.nGrowVect().allGE(nghost));
-
-    Real sm = Real(0.0);
-#ifdef AMREX_USE_GPU
-    if (Gpu::inLaunchRegion()) {
-        auto const& xma = x.const_arrays();
-        auto const& yma = y.const_arrays();
-        auto const& mma = mask.const_arrays();
-        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
-        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
-        {
-            Real t = Real(0.0);
-            if (pf(mma[box_no](i,j,k))) {
-                auto const& xfab = xma[box_no];
-                auto const& yfab = yma[box_no];
-                for (int n = 0; n < numcomp; ++n) {
-                    t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-                }
-            }
-            return t;
-        });
-    } else
-#endif
-    {
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
-#endif
-        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
-        {
-            Box const& bx = mfi.growntilebox(nghost);
-            Array4<Real const> const& xfab = x.const_array(mfi);
-            Array4<Real const> const& yfab = y.const_array(mfi);
-            auto const& mfab = mask.const_array(mfi);
-            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
-            {
-                if (pf(mfab(i,j,k))) {
-                    sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-                }
-            });
-        }
-    }
-
-    if (!local) {
-        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
-    }
-
-    return sm;
-}
-
 template <typename CMF, typename FMF,
           std::enable_if_t<IsFabArray_v<CMF> && IsFabArray_v<FMF>, int> FOO>
 void average_face_to_cellcenter (CMF& cc, int dcomp,
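The masked overload removed here (and re-added further down) takes a mask FabArray plus a predicate. A hedged usage sketch, assuming an `iMultiFab` mask where nonzero entries mark cells to include; `masked_norm2` and both lambdas are hypothetical:

```cpp
#include <AMReX_iMultiFab.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Sum of squared differences over component 0, restricted to masked-in cells.
amrex::Real masked_norm2 (amrex::iMultiFab const& mask,
                          amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    return amrex::NormHelper(
        mask, x, 0, y, 0,
        // pf: include a cell when its mask entry is nonzero
        [=] AMREX_GPU_HOST_DEVICE (int m) -> bool { return m != 0; },
        // f: squared difference, accumulated into the partial norm
        [=] AMREX_GPU_HOST_DEVICE (amrex::Real a, amrex::Real b) -> amrex::Real {
            return (a-b)*(a-b);
        },
        1, amrex::IntVect(0), false);
}
```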
@@ -1017,6 +894,140 @@ MF get_line_data (MF const& mf, int dir, IntVect const& cell)
     }
 }
 
+template <typename T, typename U>
+T cast (U const& mf_in)
+{
+    T mf_out(mf_in.boxArray(), mf_in.DistributionMap(), mf_in.nComp(), mf_in.nGrowVect());
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+    for (MFIter mfi(mf_in); mfi.isValid(); ++mfi)
+    {
+        const Long n = mfi.fabbox().numPts() * mf_in.nComp();
+        auto* pdst = mf_out[mfi].dataPtr();
+        auto const* psrc = mf_in [mfi].dataPtr();
+        AMREX_HOST_DEVICE_PARALLEL_FOR_1D (n, i,
+        {
+            pdst[i] = static_cast<typename T::value_type>(psrc[i]); // NOLINT(bugprone-signed-char-misuse)
+        });
+    }
+    return mf_out;
+}
+
+template <typename F>
+Real NormHelper (const MultiFab& x, int xcomp,
+                 const MultiFab& y, int ycomp,
+                 F && f,
+                 int numcomp, IntVect nghost, bool local)
+{
+    BL_ASSERT(x.boxArray() == y.boxArray());
+    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
+
+    Real sm = Real(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& xma = x.const_arrays();
+        auto const& yma = y.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
+        {
+            Real t = Real(0.0);
+            auto const& xfab = xma[box_no];
+            auto const& yfab = yma[box_no];
+            for (int n = 0; n < numcomp; ++n) {
+                t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            Array4<Real const> const& xfab = x.const_array(mfi);
+            Array4<Real const> const& yfab = y.const_array(mfi);
+            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
+            {
+                sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
+template <typename MMF, typename Pred, typename F>
+Real NormHelper (const MMF& mask,
+                 const MultiFab& x, int xcomp,
+                 const MultiFab& y, int ycomp,
+                 Pred && pf,
+                 F && f,
+                 int numcomp, IntVect nghost, bool local)
+{
+    BL_ASSERT(x.boxArray() == y.boxArray());
+    BL_ASSERT(x.boxArray() == mask.boxArray());
+    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
+    BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
+    BL_ASSERT(mask.nGrowVect().allGE(nghost));
+
+    Real sm = Real(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& xma = x.const_arrays();
+        auto const& yma = y.const_arrays();
+        auto const& mma = mask.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
+        {
+            Real t = Real(0.0);
+            if (pf(mma[box_no](i,j,k))) {
+                auto const& xfab = xma[box_no];
+                auto const& yfab = yma[box_no];
+                for (int n = 0; n < numcomp; ++n) {
+                    t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+                }
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            Array4<Real const> const& xfab = x.const_array(mfi);
+            Array4<Real const> const& yfab = y.const_array(mfi);
+            auto const& mfab = mask.const_array(mfi);
+            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
+            {
+                if (pf(mfab(i,j,k))) {
+                    sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+                }
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
 template <typename Op, typename T, typename FAB, typename F,
           std::enable_if_t<IsBaseFab<FAB>::value
 #ifndef AMREX_USE_CUDA
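The relocated `cast` definition implements the conversion advertised by the header comment (`auto mf = amrex::cast<MultiFab>(imf);`). A minimal sketch of that call; the wrapper name `to_real` is hypothetical:

```cpp
#include <AMReX_iMultiFab.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Convert an integer iMultiFab to a Real-valued MultiFab with the same
// BoxArray, DistributionMapping, component count, and ghost cells.
amrex::MultiFab to_real (amrex::iMultiFab const& imf)
{
    return amrex::cast<amrex::MultiFab>(imf);
}
```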