4 changes: 2 additions & 2 deletions .github/workflows/cuda.yml
@@ -105,8 +105,8 @@ jobs:
export LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:${LD_LIBRARY_PATH}
which nvcc || echo "nvcc not in PATH!"

-git clone https://github.com/AMReX-Codes/amrex.git ../amrex
-cd amrex && git checkout --detach 18d0a2861d31c52c65752a1d5856f54e08699ce3 && cd -
+git clone https://github.com/kngott/amrex.git ../amrex
+cd amrex && git checkout --detach kngott/graphviz && cd -
make COMP=gcc QED=FALSE USE_MPI=TRUE USE_GPU=TRUE USE_OMP=FALSE USE_PSATD=TRUE USE_CCACHE=TRUE -j 2

build_nvhpc21-11-nvcc:
2 changes: 1 addition & 1 deletion Regression/WarpX-GPU-tests.ini
@@ -60,7 +60,7 @@ emailBody = Check https://ccse.lbl.gov/pub/GpuRegressionTesting/WarpX/ for more

[AMReX]
dir = /home/regtester/git/amrex/
-branch = 18d0a2861d31c52c65752a1d5856f54e08699ce3
+branch = kngott/graphviz

[source]
dir = /home/regtester/git/WarpX
2 changes: 1 addition & 1 deletion Regression/WarpX-tests.ini
@@ -59,7 +59,7 @@ emailBody = Check https://ccse.lbl.gov/pub/RegressionTesting/WarpX/ for more det

[AMReX]
dir = /home/regtester/AMReX_RegTesting/amrex/
-branch = 18d0a2861d31c52c65752a1d5856f54e08699ce3
+branch = kngott/graphviz

[source]
dir = /home/regtester/AMReX_RegTesting/warpx
30 changes: 30 additions & 0 deletions Source/BoundaryConditions/PML.cpp
@@ -44,6 +44,7 @@
#include <AMReX_RealVect.H>
#include <AMReX_SPACE.H>
#include <AMReX_VisMF.H>
+#include <AMReX_Graph.H> // kngott/graphviz

#include <algorithm>
#include <cmath>
@@ -1184,13 +1185,30 @@ PML::Exchange (MultiFab& pml, MultiFab& reg, const Geometry& geom,
MultiFab::Add(totpmlmf,pml,2,0,1,0); // Sum the third split component
}

+// record metrics of this comm op in a graph before and after load balance steps
+//auto & warpx = WarpX::GetInstance();
+//auto const & lb_intervals = warpx.load_balance_intervals;
+//auto const cur_step = warpx.istep[0];
+//auto & graph = warpx.graph;
+
// Copy from the sum of PML split field to valid cells of regular grid
if (do_pml_in_domain){
// Valid cells of the PML and of the regular grid overlap
// Copy from valid cells of the PML to valid cells of the regular grid
ablastr::utils::communication::ParallelCopy(reg, totpmlmf, 0, 0, 1, IntVect(0), IntVect(0),
WarpX::do_single_precision_comms,
period);

+// record metrics of this comm op in a graph before and after load balance steps
+/*
+if (lb_intervals.contains(cur_step+2) || // before LB
+    lb_intervals.contains(cur_step+1)    // after LB
+   )
+{
+    graph.addParallelCopy("PML-in-domain-comm", "tmpregmf", "totpmlmf", 0.0,
+                          reg, totpmlmf, 0, 0, 1, IntVect(0), ngr, period);
+    //graph.print_table("comm_data");
+}*/
} else {
// Valid cells of the PML only overlap with guard cells of regular grid
// (and outermost valid cell of the regular grid, for nodal direction)
@@ -1201,6 +1219,18 @@ PML::Exchange (MultiFab& pml, MultiFab& reg, const Geometry& geom,
ablastr::utils::communication::ParallelCopy(tmpregmf, totpmlmf, 0, 0, 1, IntVect(0), ngr,
WarpX::do_single_precision_comms,
period);

+// record metrics of this comm op in a graph before and after load balance steps
+/*
+if (lb_intervals.contains(cur_step+2) || // before LB
+    lb_intervals.contains(cur_step+1)    // after LB
+   )
+{
+    graph.addParallelCopy("PML-comm", "tmpregmf", "totpmlmf", 0.0,
+                          tmpregmf, totpmlmf, 0, 0, 1, IntVect(0), ngr, period);
+    //graph.print_table("comm_data");
+}*/

#ifdef AMREX_USE_OMP
#pragma omp parallel if (Gpu::notInLaunchRegion())
#endif
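Note: both recording hooks in PML::Exchange are left commented out in this draft. For reference, a minimal sketch of what the enabled pattern would look like, assuming the amrex::Graph API from the kngott/graphviz AMReX fork and the WarpX members (graph, load_balance_intervals) that the disabled code references:

    // Sketch only: mirrors the disabled hooks above. amrex::Graph and the
    // WarpX::graph member exist only on the kngott/graphviz fork / this draft.
    auto & warpx = WarpX::GetInstance();
    auto const & lb_intervals = warpx.load_balance_intervals;
    auto const cur_step = warpx.istep[0];
    auto & graph = warpx.graph;

    // Record the comm pattern on the step just before and the step just after
    // a scheduled load balance, so the two graphs can be compared.
    if (lb_intervals.contains(cur_step+2) || lb_intervals.contains(cur_step+1))
    {
        graph.addParallelCopy("PML-comm", "tmpregmf", "totpmlmf", 0.0,
                              tmpregmf, totpmlmf, 0, 0, 1, IntVect(0), ngr, period);
    }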
121 changes: 94 additions & 27 deletions Source/Parallelization/WarpXRegrid.cpp
@@ -37,27 +37,28 @@
#include <AMReX_Vector.H>
#include <AMReX_iMultiFab.H>

+#include <AMReX_Graph.H> // kngott/graphviz

#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <memory>
#include <string>
#include <utility>
#include <vector>

using namespace amrex;

void
-WarpX::LoadBalance ()
-{
+WarpX::LoadBalance () {
WARPX_PROFILE_REGION("LoadBalance");
WARPX_PROFILE("WarpX::LoadBalance()");

AMREX_ALWAYS_ASSERT(costs[0] != nullptr);

#ifdef AMREX_USE_MPI
-if (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::Heuristic)
-{
+if (load_balance_costs_update_algo == LoadBalanceCostsUpdateAlgo::Heuristic) {
// compute the costs on a per-rank basis
ComputeCostsHeuristic(costs);
}
@@ -67,56 +68,51 @@ WarpX::LoadBalance ()
int loadBalancedAnyLevel = false;

const int nLevels = finestLevel();
-for (int lev = 0; lev <= nLevels; ++lev)
-{
+for (int lev = 0; lev <= nLevels; ++lev) {
int doLoadBalance = false;

// Compute the new distribution mapping
DistributionMapping newdm;
const amrex::Real nboxes = costs[lev]->size();
const amrex::Real nprocs = ParallelContext::NProcsSub();
-const int nmax = static_cast<int>(std::ceil(nboxes/nprocs*load_balance_knapsack_factor));
+const int nmax = static_cast<int>(std::ceil(nboxes / nprocs * load_balance_knapsack_factor));
// These store efficiency (meaning, the average 'cost' over all ranks,
// normalized to max cost) for current and proposed distribution mappings
amrex::Real currentEfficiency = 0.0;
amrex::Real proposedEfficiency = 0.0;

newdm = (load_balance_with_sfc)
? DistributionMapping::makeSFC(*costs[lev],
currentEfficiency, proposedEfficiency,
false,
ParallelDescriptor::IOProcessorNumber())
: DistributionMapping::makeKnapSack(*costs[lev],
currentEfficiency, proposedEfficiency,
nmax,
false,
ParallelDescriptor::IOProcessorNumber());
// As specified in the above calls to makeSFC and makeKnapSack, the new
// distribution mapping is NOT communicated to all ranks; the load-balanced
// dm is up-to-date only on the root rank, and we can decide whether to broadcast
if ((load_balance_efficiency_ratio_threshold > 0.0)
-&& (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()))
-{
-doLoadBalance = (proposedEfficiency > load_balance_efficiency_ratio_threshold*currentEfficiency);
+&& (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())) {
+doLoadBalance = (proposedEfficiency >
+                 load_balance_efficiency_ratio_threshold * currentEfficiency);
}

ParallelDescriptor::Bcast(&doLoadBalance, 1,
ParallelDescriptor::IOProcessorNumber());

-if (doLoadBalance)
-{
+if (doLoadBalance) {
Vector<int> pmap;
-if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber())
-{
+if (ParallelDescriptor::MyProc() == ParallelDescriptor::IOProcessorNumber()) {
pmap = newdm.ProcessorMap();
-} else
-{
+} else {
pmap.resize(static_cast<std::size_t>(nboxes));
}
ParallelDescriptor::Bcast(pmap.data(), pmap.size(), ParallelDescriptor::IOProcessorNumber());

-if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber())
-{
+if (ParallelDescriptor::MyProc() != ParallelDescriptor::IOProcessorNumber()) {
newdm = DistributionMapping(pmap);
}

@@ -128,6 +124,77 @@ WarpX::LoadBalance ()

loadBalancedAnyLevel = loadBalancedAnyLevel || doLoadBalance;
}

+// record metrics of costs in a graph at load balance steps
+amrex::Graph graph;
+//{
+// loadBalancedAnyLevel
+// currentEfficiency
+// proposedEfficiency
+
+// load balance costs
+for (int lev = 0; lev <= finest_level; ++lev) {
+std::string name = "costs_lev";
+name.append(std::to_string(lev));
+graph.addFab(*costs[lev], name, sizeof(amrex::Real));
+std::vector<double> costs_local(costs[lev]->local_size());
+for (int n=0; n<costs[lev]->local_size(); ++n)
+{
+costs_local[n] = costs[lev]->data()[n];
+}
+double const scaling = 1.0;
+bool const available_locally = false; // costs_local holds only this rank's boxes; the full set is distributed across ranks
+graph.addNodeWeight(name, "cost_value", costs_local, scaling, available_locally);
+}

+// E and B filling patterns (from WarpXComm.cpp)
+for (int lev = 0; lev <= finest_level; ++lev)
+{
+std::array<amrex::MultiFab *, 3> mf;
+amrex::Periodicity period;
+amrex::IntVect ng = guard_cells.ng_alloc_EB;
+// no MR or fine level of MR
+//if (patch_type == PatchType::fine)
+//{
+mf = {Efield_fp[lev][0].get(), Efield_fp[lev][1].get(), Efield_fp[lev][2].get()};
+period = Geom(lev).periodicity();
+//}
+//else // coarse patch (part of MR)
+//{
+// mf = {Efield_cp[lev][0].get(), Efield_cp[lev][1].get(), Efield_cp[lev][2].get()};
+// period = Geom(lev-1).periodicity();
+//}
+int const i = 0; // just the Ex component
+const amrex::IntVect nghost = (safe_guard_cells) ? mf[i]->nGrowVect() : ng;
+//FillBoundary(*mf[i], nghost, WarpX::do_single_precision_comms, period, nodal_sync);
+std::string mf_name = "Efield_fp_lvl";
+mf_name.append(std::to_string(lev));
+double const scaling = 1.0;
+
+graph.addFillBoundary("FillBoundaryE",
+                      mf_name,
+                      scaling,
+                      *mf[i],
+                      nghost,
+                      period);
+
+}

+// PML comm patterns (TODO) - see PML.cpp
+// with and without do_pml_in_domain
+//{
+//graph.addParallelCopy("PML-comm", "tmpregmf", "totpmlmf", 0.0,
+//                      tmpregmf, totpmlmf, 0, 0, 1, IntVect(0), ngr, period);
+//}
+
+// Capture Number of Particles per Box (TODO)
+
+// Capture Particle Comm Patterns (TODO)
+
+std::string graph_dir_name = "comm_data_step";
+graph_dir_name.append(std::to_string(istep[0]+1));
+graph.print_table(graph_dir_name);

if (loadBalancedAnyLevel)
{
mypc->Redistribute();
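The instrumentation above builds a fresh amrex::Graph on every LoadBalance call and writes it to a per-step directory. Condensed, the assumed Graph API looks as follows (available only on the kngott/graphviz AMReX fork; signatures inferred from the calls in this diff, with names taken from the hunk above and assumed to be in scope):

    amrex::Graph graph;

    // One graph node per box, tagged with the per-element payload size.
    graph.addFab(*costs[0], "costs_lev0", sizeof(amrex::Real));

    // Attach per-box weights (the measured costs); available_locally = false
    // indicates each rank holds values only for its own boxes.
    graph.addNodeWeight("costs_lev0", "cost_value", costs_local, 1.0, false);

    // Record a ghost-cell exchange pattern for a field MultiFab.
    graph.addFillBoundary("FillBoundaryE", "Efield_fp_lvl0", 1.0,
                          *Efield_fp[0][0], nghost, period);

    // Write the tables for this step into their own directory.
    graph.print_table("comm_data_step" + std::to_string(istep[0] + 1));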
2 changes: 2 additions & 0 deletions Source/WarpX.H
@@ -58,6 +58,8 @@
#include <AMReX_BaseFwd.H>
#include <AMReX_AmrCoreFwd.H>

+#include <AMReX_Graph.H> // kngott/graphviz

#include <array>
#include <iostream>
#include <limits>
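This header hunk only adds the include; the commented-out hooks in PML.cpp reference a warpx.graph member, so a declaration along the following lines is presumably added elsewhere in WarpX.H (assumed; not shown in this diff):

    amrex::Graph graph; // per-run comm-metrics graph (assumed; kngott/graphviz fork only)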
4 changes: 2 additions & 2 deletions cmake/dependencies/AMReX.cmake
@@ -238,10 +238,10 @@ set(WarpX_amrex_src ""
"Local path to AMReX source directory (preferred if set)")

# Git fetcher
-set(WarpX_amrex_repo "https://github.com/AMReX-Codes/amrex.git"
+set(WarpX_amrex_repo "https://github.com/kngott/amrex.git"
CACHE STRING
"Repository URI to pull and build AMReX from if(WarpX_amrex_internal)")
-set(WarpX_amrex_branch "18d0a2861d31c52c65752a1d5856f54e08699ce3"
+set(WarpX_amrex_branch "kngott/graphviz"
CACHE STRING
"Repository branch for WarpX_amrex_repo if(WarpX_amrex_internal)")

4 changes: 2 additions & 2 deletions run_test.sh
@@ -70,8 +70,8 @@ export SETUPTOOLS_USE_DISTUTILS="stdlib"
python3 -m pip install --upgrade -r warpx/Regression/requirements.txt

# Clone AMReX and warpx-data
-git clone https://github.com/AMReX-Codes/amrex.git
-cd amrex && git checkout --detach 18d0a2861d31c52c65752a1d5856f54e08699ce3 && cd -
+git clone https://github.com/kngott/amrex.git
+cd amrex && git checkout --detach kngott/graphviz && cd -
# warpx-data contains various required data sets
git clone --depth 1 https://github.com/ECP-WarpX/warpx-data.git
