@@ -226,25 +226,36 @@ namespace amrex
 
     //! example: auto mf = amrex::cast<MultiFab>(imf);
     template <typename T, typename U>
-    T cast (U const& mf_in)
-    {
-        T mf_out(mf_in.boxArray(), mf_in.DistributionMap(), mf_in.nComp(), mf_in.nGrowVect());
+    T cast (U const& mf_in);
 
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (Gpu::notInLaunchRegion())
-#endif
-        for (MFIter mfi(mf_in); mfi.isValid(); ++mfi)
-        {
-            const Long n = mfi.fabbox().numPts() * mf_in.nComp();
-            auto* pdst = mf_out[mfi].dataPtr();
-            auto const* psrc = mf_in [mfi].dataPtr();
-            AMREX_HOST_DEVICE_PARALLEL_FOR_1D (n, i,
-            {
-                pdst[i] = static_cast<typename T::value_type>(psrc[i]); // NOLINT(bugprone-signed-char-misuse)
-            });
-        }
-        return mf_out;
-    }
+    /**
+     * \brief Returns part of a norm based on two MultiFabs.
+     *
+     * The MultiFabs MUST have the same underlying BoxArray.
+     * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
+     * inside the summation.
+     */
+    template <typename F>
+    Real NormHelper (const MultiFab& x, int xcomp,
+                     const MultiFab& y, int ycomp,
+                     F && f,
+                     int numcomp, IntVect nghost, bool local);
+
+    /**
+     * \brief Returns part of a norm based on three MultiFabs.
+     *
+     * The MultiFabs MUST have the same underlying BoxArray.
+     * The predicate pf is used to test the mask.
+     * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
+     * inside the summation, for cells where pf(mask(i,j,k)) is true.
+     */
+    template <typename MMF, typename Pred, typename F>
+    Real NormHelper (const MMF& mask,
+                     const MultiFab& x, int xcomp,
+                     const MultiFab& y, int ycomp,
+                     Pred && pf,
+                     F && f,
+                     int numcomp, IntVect nghost, bool local);
 
     /**
      * \brief Reduce FabArray/MultiFab data to a plane.
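For orientation, a minimal usage sketch of the two-MultiFab `NormHelper` overload declared above: a global dot product over one component. The wrapper name `dot_product`, the lambda, and the argument choices are illustrative assumptions, not part of this change.

```cpp
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Dot product over component 0 of x and y, valid cells only.
amrex::Real dot_product (amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    return amrex::NormHelper(
        x, 0, y, 0,
        // f(a,b) = a*b; must be host/device callable so the GPU path can use it
        [=] AMREX_GPU_HOST_DEVICE (amrex::Real a, amrex::Real b) -> amrex::Real {
            return a*b;
        },
        1,                  // numcomp: a single component
        amrex::IntVect(0),  // nghost: no ghost cells in the sum
        false);             // local = false: reduce partial sums across MPI ranks
}
```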
@@ -621,140 +632,6 @@ void average_down (const FabArray<FAB>& S_fine, FabArray<FAB>& S_crse,
     }
 }
 
-
-
-
-
-/**
- * \brief Returns part of a norm based on two MultiFabs
- * The MultiFabs MUST have the same underlying BoxArray.
- * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
- * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n)
- */
-
-template <typename F>
-Real
-NormHelper (const MultiFab& x, int xcomp,
-            const MultiFab& y, int ycomp,
-            F && f,
-            int numcomp, IntVect nghost, bool local)
-{
-    BL_ASSERT(x.boxArray() == y.boxArray());
-    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
-    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
-
-    Real sm = Real(0.0);
-#ifdef AMREX_USE_GPU
-    if (Gpu::inLaunchRegion()) {
-        auto const& xma = x.const_arrays();
-        auto const& yma = y.const_arrays();
-        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
-        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
-        {
-            Real t = Real(0.0);
-            auto const& xfab = xma[box_no];
-            auto const& yfab = yma[box_no];
-            for (int n = 0; n < numcomp; ++n) {
-                t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-            }
-            return t;
-        });
-    } else
-#endif
-    {
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
-#endif
-        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
-        {
-            Box const& bx = mfi.growntilebox(nghost);
-            Array4<Real const> const& xfab = x.const_array(mfi);
-            Array4<Real const> const& yfab = y.const_array(mfi);
-            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
-            {
-                sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-            });
-        }
-    }
-
-    if (!local) {
-        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
-    }
-
-    return sm;
-}
-
-/**
- * \brief Returns part of a norm based on three MultiFabs
- * The MultiFabs MUST have the same underlying BoxArray.
- * The Predicate pf is used to test the mask
- * The function f is applied elementwise as f(x(i,j,k,n),y(i,j,k,n))
- * inside the summation (subject to a valid mask entry pf(mask(i,j,k,n)
- */
-
-template <typename MMF, typename Pred, typename F>
-Real
-NormHelper (const MMF& mask,
-            const MultiFab& x, int xcomp,
-            const MultiFab& y, int ycomp,
-            Pred && pf,
-            F && f,
-            int numcomp, IntVect nghost, bool local)
-{
-    BL_ASSERT(x.boxArray() == y.boxArray());
-    BL_ASSERT(x.boxArray() == mask.boxArray());
-    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
-    BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
-    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
-    BL_ASSERT(mask.nGrowVect().allGE(nghost));
-
-    Real sm = Real(0.0);
-#ifdef AMREX_USE_GPU
-    if (Gpu::inLaunchRegion()) {
-        auto const& xma = x.const_arrays();
-        auto const& yma = y.const_arrays();
-        auto const& mma = mask.const_arrays();
-        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
-        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
-        {
-            Real t = Real(0.0);
-            if (pf(mma[box_no](i,j,k))) {
-                auto const& xfab = xma[box_no];
-                auto const& yfab = yma[box_no];
-                for (int n = 0; n < numcomp; ++n) {
-                    t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-                }
-            }
-            return t;
-        });
-    } else
-#endif
-    {
-#ifdef AMREX_USE_OMP
-#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
-#endif
-        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
-        {
-            Box const& bx = mfi.growntilebox(nghost);
-            Array4<Real const> const& xfab = x.const_array(mfi);
-            Array4<Real const> const& yfab = y.const_array(mfi);
-            auto const& mfab = mask.const_array(mfi);
-            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
-            {
-                if (pf(mfab(i,j,k))) {
-                    sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
-                }
-            });
-        }
-    }
-
-    if (!local) {
-        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
-    }
-
-    return sm;
-}
-
 template <typename CMF, typename FMF,
           std::enable_if_t<IsFabArray_v<CMF> && IsFabArray_v<FMF>, int> FOO>
 void average_face_to_cellcenter (CMF& cc, int dcomp,
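The masked overload removed here (and re-added further down) takes a mask FabArray plus a predicate. A hedged usage sketch, assuming an `iMultiFab` mask where nonzero entries mark cells to include; `masked_norm2` and both lambdas are hypothetical:

```cpp
#include <AMReX_iMultiFab.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Sum of squared differences over component 0, restricted to masked-in cells.
amrex::Real masked_norm2 (amrex::iMultiFab const& mask,
                          amrex::MultiFab const& x, amrex::MultiFab const& y)
{
    return amrex::NormHelper(
        mask, x, 0, y, 0,
        // pf: include a cell when its mask entry is nonzero
        [=] AMREX_GPU_HOST_DEVICE (int m) -> bool { return m != 0; },
        // f: squared difference, accumulated into the partial norm
        [=] AMREX_GPU_HOST_DEVICE (amrex::Real a, amrex::Real b) -> amrex::Real {
            return (a-b)*(a-b);
        },
        1, amrex::IntVect(0), false);
}
```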
@@ -1017,6 +894,140 @@ MF get_line_data (MF const& mf, int dir, IntVect const& cell)
     }
 }
 
+template <typename T, typename U>
+T cast (U const& mf_in)
+{
+    T mf_out(mf_in.boxArray(), mf_in.DistributionMap(), mf_in.nComp(), mf_in.nGrowVect());
+
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+    for (MFIter mfi(mf_in); mfi.isValid(); ++mfi)
+    {
+        const Long n = mfi.fabbox().numPts() * mf_in.nComp();
+        auto* pdst = mf_out[mfi].dataPtr();
+        auto const* psrc = mf_in [mfi].dataPtr();
+        AMREX_HOST_DEVICE_PARALLEL_FOR_1D (n, i,
+        {
+            pdst[i] = static_cast<typename T::value_type>(psrc[i]); // NOLINT(bugprone-signed-char-misuse)
+        });
+    }
+    return mf_out;
+}
+
+template <typename F>
+Real NormHelper (const MultiFab& x, int xcomp,
+                 const MultiFab& y, int ycomp,
+                 F && f,
+                 int numcomp, IntVect nghost, bool local)
+{
+    BL_ASSERT(x.boxArray() == y.boxArray());
+    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
+
+    Real sm = Real(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& xma = x.const_arrays();
+        auto const& yma = y.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
+        {
+            Real t = Real(0.0);
+            auto const& xfab = xma[box_no];
+            auto const& yfab = yma[box_no];
+            for (int n = 0; n < numcomp; ++n) {
+                t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            Array4<Real const> const& xfab = x.const_array(mfi);
+            Array4<Real const> const& yfab = y.const_array(mfi);
+            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
+            {
+                sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
+template <typename MMF, typename Pred, typename F>
+Real NormHelper (const MMF& mask,
+                 const MultiFab& x, int xcomp,
+                 const MultiFab& y, int ycomp,
+                 Pred && pf,
+                 F && f,
+                 int numcomp, IntVect nghost, bool local)
+{
+    BL_ASSERT(x.boxArray() == y.boxArray());
+    BL_ASSERT(x.boxArray() == mask.boxArray());
+    BL_ASSERT(x.DistributionMap() == y.DistributionMap());
+    BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost));
+    BL_ASSERT(mask.nGrowVect().allGE(nghost));
+
+    Real sm = Real(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& xma = x.const_arrays();
+        auto const& yma = y.const_arrays();
+        auto const& mma = mask.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<Real>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<Real>
+        {
+            Real t = Real(0.0);
+            if (pf(mma[box_no](i,j,k))) {
+                auto const& xfab = xma[box_no];
+                auto const& yfab = yma[box_no];
+                for (int n = 0; n < numcomp; ++n) {
+                    t += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+                }
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            Array4<Real const> const& xfab = x.const_array(mfi);
+            Array4<Real const> const& yfab = y.const_array(mfi);
+            auto const& mfab = mask.const_array(mfi);
+            AMREX_LOOP_4D(bx, numcomp, i, j, k, n,
+            {
+                if (pf(mfab(i,j,k))) {
+                    sm += f(xfab(i,j,k,xcomp+n), yfab(i,j,k,ycomp+n));
+                }
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
 template <typename Op, typename T, typename FAB, typename F,
           std::enable_if_t<IsBaseFab<FAB>::value
 #ifndef AMREX_USE_CUDA
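The relocated `cast` definition implements the conversion advertised by the header comment (`auto mf = amrex::cast<MultiFab>(imf);`). A minimal sketch of that call; the wrapper name `to_real` is hypothetical:

```cpp
#include <AMReX_iMultiFab.H>
#include <AMReX_MultiFab.H>
#include <AMReX_MultiFabUtil.H>

// Convert an integer iMultiFab to a Real-valued MultiFab with the same
// BoxArray, DistributionMapping, component count, and ghost cells.
amrex::MultiFab to_real (amrex::iMultiFab const& imf)
{
    return amrex::cast<amrex::MultiFab>(imf);
}
```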