igfuw
diff --git a/‎src/detail/exec_timer.hpp‎
Lines changed: 20 additions & 2 deletions b/‎src/detail/exec_timer.hpp‎
Lines changed: 20 additions & 2 deletions
diff --git a/‎src/detail/func_time.hpp‎
Lines changed: 86 additions & 0 deletions b/‎src/detail/func_time.hpp‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp‎
Lines changed: 20 additions & 5 deletions b/‎src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp‎
Lines changed: 20 additions & 5 deletions
diff --git a/‎src/solvers/lgrngn/hook_ante_step_lgrngn.hpp‎
Lines changed: 0 additions & 22 deletions b/‎src/solvers/lgrngn/hook_ante_step_lgrngn.hpp‎
Lines changed: 0 additions & 22 deletions
diff --git a/‎src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp‎
Lines changed: 26 additions & 4 deletions b/‎src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp‎
Lines changed: 26 additions & 4 deletions
diff --git a/‎src/solvers/slvr_common.hpp‎
Lines changed: 4 additions & 4 deletions b/‎src/solvers/slvr_common.hpp‎
Lines changed: 4 additions & 4 deletions
@@ -54,7 +54,10 @@ class exec_timer : public solver_t
     parent_t::hook_ante_loop(nt);
     this->mem->barrier();
     if (this->rank == 0) 
+    {   
       tbeg_loop = parent_t::clock::now();
+      trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop 
+    }
     this->mem->barrier();
   }
 
@@ -86,11 +89,17 @@ class exec_timer : public solver_t
         tend_loop = parent_t::clock::now();
         tloop = std::chrono::duration_cast<std::chrono::milliseconds>( tend_loop - tbeg_loop );
 
+        // calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all
+        typename parent_t::timer tsync_in = parent_t::tsync,
+                                 tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU)
+                                 tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu)
+                                 tcpu = tloop - tgpu - tcpugpu;
+         
         std::cout <<  "wall time in milliseconds: " << std::endl
           << "loop:                            " << tloop.count() << std::endl
           << "  hook_ante_step:                  " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl
-          << "    async_wait:                      " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
           << "    hook_mixed_rhs_ante_step:        " << thmas.count() << " ("<< setup::real_t(thmas.count())/tloop.count()*100 <<"%)" << std::endl
+          << "      async_wait:                    " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
           << "      sync:                            " << parent_t::tsync.count() << " ("<< setup::real_t(parent_t::tsync.count())/tloop.count()*100 <<"%)" << std::endl
           << "  step:                            " << thas_hads.count() << " ("<< setup::real_t(thas_hads.count())/tloop.count()*100 <<"%)" << std::endl
           << "  hook_ante_delayed_step:          " << thads.count() << " ("<< setup::real_t(thads.count())/tloop.count()*100 <<"%)" << std::endl
@@ -99,9 +108,18 @@ class exec_timer : public solver_t
           << "  delayed step:                    " << thads_hps.count() << " ("<< setup::real_t(thads_hps.count())/tloop.count()*100 <<"%)" << std::endl
           << "  hook_post_step:                  " << thps.count() << " ("<< setup::real_t(thps.count())/tloop.count()*100 <<"%)" << std::endl
           << "    hook_mixed_rhs_post_step:        " << thmps.count() << " ("<< setup::real_t(thmps.count())/tloop.count()*100 <<"%)" << std::endl
-          << "    record_all:                      " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
+          << "    record_all (in loop):            " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
           << "      async_wait in record_all:      " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl
           << "  hook_post_step->hook_ante_step:  " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl;
+
+          std::cout << std::endl
+          << "CPU/GPU concurrency stats, only make sense for async lgrngn runs" << std::endl
+          << "and does not take into account GPU time in record_all, so most accurate without diag:" << std::endl
+          << "  pure CPU calculations: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop.count()*100 <<"%)" << std::endl
+          << "  pure GPU calculations: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop.count()*100 <<"%)" << std::endl
+          << "  concurrent CPU&GPU:    " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop.count()*100 <<"%)" << std::endl
+          << "  tsync_gpu:  " << parent_t::tsync_gpu.count() << " ("<< setup::real_t(parent_t::tsync_gpu.count())/tloop.count()*100 <<"%)" << std::endl
+          << "  tasync_gpu: " << parent_t::tasync_gpu.count() << " ("<< setup::real_t(parent_t::tasync_gpu.count())/tloop.count()*100 <<"%)" << std::endl;
       }
     }
   }
 
@@ -0,0 +1,86 @@
+// function that calculates execution time of any other member function called via ptr
+#pragma once
+
+// async_forwarder code taken from https://kholdstare.github.io/technical/2012/12/18/perfect-forwarding-to-async-2.html (C) Alexander Kondratskiy
+// it is used to pass any type of reference (lvalue or rvalue) to std::async
+template <typename T>
+class async_forwarder
+{
+    // Store value directly
+    T val_;
+
+public:
+    /**
+     * Move an rvalue of T into the wrapper,
+     * incurring no copies.
+     */
+    async_forwarder(T&& t) 
+     : val_(std::move(t)) { }
+
+    // ensure no copies are made
+    async_forwarder(async_forwarder const& other) = delete;
+
+    // move constructor
+    async_forwarder(async_forwarder&& other)
+        : val_(std::move(other.val_)) { }
+
+    // Move the value out.
+    // Note: can only occur once!
+    operator T&& ()       { return std::move(val_); }
+    operator T&& () const { return std::move(val_); }
+};
+
+// This particular specialization
+// is essentially std::ref
+template <typename T>
+class async_forwarder<T&>
+{
+    T& val_;
+
+public:
+    /**
+     * Wrap the reference when passed an lvalue reference,
+     * to fool std::async
+     */
+    async_forwarder(T& t) : val_(t) { }
+
+    // ensure no copies are made
+    async_forwarder(async_forwarder const& other) = delete;
+
+    // move constructor
+    async_forwarder(async_forwarder&& other)
+        : val_(other.val_) { }
+
+    // User-defined conversion that automatically
+    // converts to the appropriate type
+    operator T&       ()       { return val_; }
+    operator T const& () const { return val_; }
+};
+
+
+#if defined(UWLCM_TIMING)
+  template<class clock, class timer, class F, class ptr, typename... Args>
+  timer func_time(F func, ptr p, Args&&... args){
+    auto t1=clock::now();
+    (p->*func)(std::forward<Args>(args)...);
+    return std::chrono::duration_cast<timer>(clock::now()-t1);
+  }
+#else
+  template<class clock, class timer, class F, class ptr, typename... Args>
+  timer func_time(F func, ptr p, Args&&... args){
+    (p->*func)(std::forward<Args>(args)...);
+    return timer();
+  }
+#endif
+
+template<class clock, class timer, class F, class ptr, typename... Args>
+std::future<timer> async_timing_launcher(F func, ptr p, Args&&... args) // func and p are pointers, so their copies are lightweight
+{
+  return std::async(
+           std::launch::async,
+           func_time<clock, timer, F, ptr, Args...>,
+           func, 
+           p,
+           async_forwarder<Args>(std::forward<Args>(args))... // ATTENTION! args are passed by reference to async
+         );
+}
@@ -3,6 +3,7 @@
 #if defined(STD_FUTURE_WORKS)
 #  include <future>
 #endif
+#include "../../detail/func_time.hpp"
 
 template <class ct_params_t>
 void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
@@ -19,7 +20,11 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
 #if defined(UWLCM_TIMING)
       tbeg = parent_t::clock::now();
 #endif
+#if defined(UWLCM_TIMING)
+      parent_t::tsync_gpu += ftr.get();
+#else
       ftr.get();
+#endif
 #if defined(UWLCM_TIMING)
       tend = parent_t::clock::now();
       parent_t::tsync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
@@ -48,6 +53,8 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
 
   // store liquid water content (post-cond, pre-adve and pre-subsidence)
   diag_rl();
+  if(ct_params_t::sgs_scheme == libmpdataxx::solvers::smg)
+    diag_rc();
 
   if (this->rank == 0) 
   {
@@ -56,6 +63,7 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
       using libcloudphxx::lgrngn::particles_t;
       using libcloudphxx::lgrngn::CUDA;
       using libcloudphxx::lgrngn::multi_CUDA;
+      using timer = typename parent_t::timer;
 #if defined(UWLCM_TIMING)
       tbeg = parent_t::clock::now();
 #endif
@@ -64,15 +72,13 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
       {
         assert(!ftr.valid());
         if(params.backend == CUDA)
-          ftr = std::async(
-            std::launch::async, 
+          ftr = async_timing_launcher<typename parent_t::clock, timer>(
             &particles_t<real_t, CUDA>::step_async, 
             dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
             params.cloudph_opts
           );
         else if(params.backend == multi_CUDA)
-          ftr = std::async(
-            std::launch::async, 
+          ftr = async_timing_launcher<typename parent_t::clock, timer>(
             &particles_t<real_t, multi_CUDA>::step_async, 
             dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
             params.cloudph_opts
@@ -106,9 +112,18 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
     this->vert_grad_cnt(tmp1, F, params.dz);
     F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
     r_l(ijk) += F(ijk) * this->dt;
+
+    tmp1(ijk) = r_c(ijk);
+    // fill halos for gradient calculation
+    // TODO: no need to xchng in horizontal, which potentially causes MPI communication
+    this->xchng_sclr(tmp1, this->ijk, this->halo);
+    this->vert_grad_cnt(tmp1, F, params.dz);
+    F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
+    r_c(ijk) += F(ijk) * this->dt;
   }
 
-  // advect r_l (1st-order)
+  // advect r_l and r_c (1st-order)
   this->self_advec_donorcell(this->r_l);
+  this->self_advec_donorcell(this->r_c);
   negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of ante delayed step");
 }
@@ -7,28 +7,6 @@
 template <class ct_params_t>
 void slvr_lgrngn<ct_params_t>::hook_ante_step()
 {
-  if (this->rank == 0) 
-  {
-    // assuring previous async step finished ...
-#if defined(STD_FUTURE_WORKS)
-    if (
-      params.async && 
-      this->timestep != 0 && // ... but not in first timestep ...
-      ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
-    ) {
-      assert(ftr.valid());
-#if defined(UWLCM_TIMING)
-      tbeg = parent_t::clock::now();
-#endif
-      ftr.get();
-#if defined(UWLCM_TIMING)
-      tend = parent_t::clock::now();
-      parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
-#endif
-    } else assert(!ftr.valid()); 
-#endif
-  }
-  this->mem->barrier();
   parent_t::hook_ante_step(); // includes RHS, which in turn launches sync_in and step_cond
   negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv after at the end of hook_ante_step");
 }
@@ -1,5 +1,6 @@
 #pragma once
 #include "../slvr_lgrngn.hpp"
+#include "../../detail/func_time.hpp"
 #if defined(STD_FUTURE_WORKS)
 #  include <future>
 #endif
@@ -33,6 +34,29 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
       Cz.reindex(this->zero) /= (*params.rhod)(this->vert_idx); // TODO: should be interpolated, since theres a shift between positions of rhod and Cz
     }
 
+    // assuring previous async step finished ...
+#if defined(STD_FUTURE_WORKS)
+    if (
+      params.async && 
+      this->timestep != 0 && // ... but not in first timestep ...
+      ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
+    ) {
+      assert(ftr.valid());
+#if defined(UWLCM_TIMING)
+      tbeg = parent_t::clock::now();
+#endif
+#if defined(UWLCM_TIMING)
+      parent_t::tasync_gpu += ftr.get();
+#else
+      ftr.get();
+#endif
+#if defined(UWLCM_TIMING)
+      tend = parent_t::clock::now();
+      parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
+#endif
+    } else assert(!ftr.valid()); 
+#endif
+
     // start synchronous stuff timer
 #if defined(UWLCM_TIMING)
     tbeg = parent_t::clock::now();
@@ -62,8 +86,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
     {
       assert(!ftr.valid());
       if(params.backend == CUDA)
-        ftr = std::async(
-          std::launch::async, 
+        ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
           &particles_t<real_t, CUDA>::step_cond, 
           dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
           params.cloudph_opts,
@@ -72,8 +95,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
           std::map<enum libcloudphxx::common::chem::chem_species_t, libcloudphxx::lgrngn::arrinfo_t<real_t> >()
         );
       else if(params.backend == multi_CUDA)
-        ftr = std::async(
-          std::launch::async, 
+        ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
           &particles_t<real_t, multi_CUDA>::step_cond, 
           dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
           params.cloudph_opts,
 
@@ -25,12 +25,12 @@ class slvr_common : public slvr_dim<ct_params_t>
   protected:
 
 #if defined(UWLCM_TIMING)
-
   using clock = std::chrono::system_clock; 
   using timer = std::chrono::milliseconds; 
-  timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
-
-  protected:
+  timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
+#else
+  using timer = void; 
+  using clock = void; 
 #endif
 
   int spinup; // number of timesteps