Skip to content

Commit b039119

Browse files
authored
Merge pull request #124 from pdziekan/rc_optimisation
Rc optimisation
2 parents 1b0f9fe + 01a2eb7 commit b039119

File tree

10 files changed

+189
-167
lines changed

10 files changed

+189
-167
lines changed

src/detail/exec_timer.hpp

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,10 @@ class exec_timer : public solver_t
5454
parent_t::hook_ante_loop(nt);
5555
this->mem->barrier();
5656
if (this->rank == 0)
57+
{
5758
tbeg_loop = parent_t::clock::now();
59+
trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop
60+
}
5861
this->mem->barrier();
5962
}
6063

@@ -86,11 +89,17 @@ class exec_timer : public solver_t
8689
tend_loop = parent_t::clock::now();
8790
tloop = std::chrono::duration_cast<std::chrono::milliseconds>( tend_loop - tbeg_loop );
8891

92+
// calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all
93+
typename parent_t::timer tsync_in = parent_t::tsync,
94+
tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU)
95+
tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu)
96+
tcpu = tloop - tgpu - tcpugpu;
97+
8998
std::cout << "wall time in milliseconds: " << std::endl
9099
<< "loop: " << tloop.count() << std::endl
91100
<< " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl
92-
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
93101
<< " hook_mixed_rhs_ante_step: " << thmas.count() << " ("<< setup::real_t(thmas.count())/tloop.count()*100 <<"%)" << std::endl
102+
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
94103
<< " sync: " << parent_t::tsync.count() << " ("<< setup::real_t(parent_t::tsync.count())/tloop.count()*100 <<"%)" << std::endl
95104
<< " step: " << thas_hads.count() << " ("<< setup::real_t(thas_hads.count())/tloop.count()*100 <<"%)" << std::endl
96105
<< " hook_ante_delayed_step: " << thads.count() << " ("<< setup::real_t(thads.count())/tloop.count()*100 <<"%)" << std::endl
@@ -99,9 +108,18 @@ class exec_timer : public solver_t
99108
<< " delayed step: " << thads_hps.count() << " ("<< setup::real_t(thads_hps.count())/tloop.count()*100 <<"%)" << std::endl
100109
<< " hook_post_step: " << thps.count() << " ("<< setup::real_t(thps.count())/tloop.count()*100 <<"%)" << std::endl
101110
<< " hook_mixed_rhs_post_step: " << thmps.count() << " ("<< setup::real_t(thmps.count())/tloop.count()*100 <<"%)" << std::endl
102-
<< " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
111+
<< " record_all (in loop): " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
103112
<< " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl
104113
<< " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl;
114+
115+
std::cout << std::endl
116+
<< "CPU/GPU concurrency stats, only make sense for async lgrngn runs" << std::endl
117+
<< "and does not take into account GPU time in record_all, so most accurate without diag:" << std::endl
118+
<< " pure CPU calculations: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop.count()*100 <<"%)" << std::endl
119+
<< " pure GPU calculations: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop.count()*100 <<"%)" << std::endl
120+
<< " concurrent CPU&GPU: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop.count()*100 <<"%)" << std::endl
121+
<< " tsync_gpu: " << parent_t::tsync_gpu.count() << " ("<< setup::real_t(parent_t::tsync_gpu.count())/tloop.count()*100 <<"%)" << std::endl
122+
<< " tasync_gpu: " << parent_t::tasync_gpu.count() << " ("<< setup::real_t(parent_t::tasync_gpu.count())/tloop.count()*100 <<"%)" << std::endl;
105123
}
106124
}
107125
}

src/detail/func_time.hpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// function that calculates execution time of any other member function called via ptr
2+
#pragma once

#include <type_traits>
3+
4+
// async_forwarder code taken from https://kholdstare.github.io/technical/2012/12/18/perfect-forwarding-to-async-2.html (C) Alexander Kondratskiy
5+
// it is used to pass any type of reference (lvalue or rvalue) to std::async
6+
/**
 * Owning, move-only wrapper used to hand an rvalue argument to std::async.
 *
 * The value is moved into the wrapper on construction and moved out again
 * when the wrapper is converted to T&&.
 *
 * @tparam T  type of the wrapped value (lvalue references are handled by
 *            the async_forwarder<T&> partial specialization)
 */
template <typename T>
class async_forwarder
{
  // Store the value directly.
  // The storage is mutable and stripped of const so that the value can be
  // moved out through a const wrapper as well: with a plain `T val_` the
  // const conversion operator below would yield `const T&&`, which cannot
  // bind to `T&&`, so it failed to compile whenever it was instantiated.
  // remove_const keeps async_forwarder<const U> usable (mutable cannot be
  // applied to a const-qualified member).
  mutable typename std::remove_const<T>::type val_;

  public:
  /**
   * Move an rvalue of T into the wrapper.
   */
  async_forwarder(T&& t)
    : val_(std::move(t)) { }

  // ensure no copies are made
  async_forwarder(async_forwarder const& other) = delete;

  // move constructor
  async_forwarder(async_forwarder&& other)
    : val_(std::move(other.val_)) { }

  // Move the value out.
  // Note: can only occur once! Afterwards the stored value is moved-from.
  operator T&& () { return std::move(val_); }
  operator T&& () const { return std::move(val_); }
};
32+
33+
// This particular specialization
34+
// is essentially std::ref
35+
template <typename T>
36+
class async_forwarder<T&>
37+
{
38+
T& val_;
39+
40+
public:
41+
/**
42+
* Wrap the reference when passed an lvalue reference,
43+
* to fool std::async
44+
*/
45+
async_forwarder(T& t) : val_(t) { }
46+
47+
// ensure no copies are made
48+
async_forwarder(async_forwarder const& other) = delete;
49+
50+
// move constructor
51+
async_forwarder(async_forwarder&& other)
52+
: val_(other.val_) { }
53+
54+
// User-defined conversion that automatically
55+
// converts to the appropriate type
56+
operator T& () { return val_; }
57+
operator T const& () const { return val_; }
58+
};
59+
60+
61+
/**
 * Call the member function `func` on the object pointed to by `p`,
 * perfectly forwarding `args` to it.
 *
 * If UWLCM_TIMING is defined, the call is bracketed by clock::now() and the
 * elapsed time is returned, converted to `timer` with duration_cast.
 * Otherwise no time is measured and a value-initialized `timer()` is returned.
 *
 * @tparam clock  clock type providing now()
 * @tparam timer  return type; a std::chrono duration when timing is enabled
 * @param func    pointer to the member function to call
 * @param p       pointer to the object on which func is called
 * @param args    arguments forwarded to func
 */
template<class clock, class timer, class F, class ptr, typename... Args>
timer func_time(F func, ptr p, Args&&... args)
{
#if defined(UWLCM_TIMING)
  const auto call_start = clock::now();
  (p->*func)(std::forward<Args>(args)...);
  const auto elapsed = clock::now() - call_start;
  return std::chrono::duration_cast<timer>(elapsed);
#else
  (p->*func)(std::forward<Args>(args)...);
  return timer();
#endif
}
75+
76+
template<class clock, class timer, class F, class ptr, typename... Args>
77+
std::future<timer> async_timing_launcher(F func, ptr p, Args&&... args) // func and p are pointers, so their copies are lightweight
78+
{
79+
return std::async(
80+
std::launch::async,
81+
func_time<clock, timer, F, ptr, Args...>,
82+
func,
83+
p,
84+
async_forwarder<Args>(std::forward<Args>(args))... // ATTENTION! args are passed by reference to async
85+
);
86+
}

src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#if defined(STD_FUTURE_WORKS)
44
# include <future>
55
#endif
6+
#include "../../detail/func_time.hpp"
67

78
template <class ct_params_t>
89
void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
@@ -19,7 +20,11 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
1920
#if defined(UWLCM_TIMING)
2021
tbeg = parent_t::clock::now();
2122
#endif
23+
#if defined(UWLCM_TIMING)
24+
parent_t::tsync_gpu += ftr.get();
25+
#else
2226
ftr.get();
27+
#endif
2328
#if defined(UWLCM_TIMING)
2429
tend = parent_t::clock::now();
2530
parent_t::tsync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
@@ -48,6 +53,8 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
4853

4954
// store liquid water content (post-cond, pre-adve and pre-subsidence)
5055
diag_rl();
56+
if(ct_params_t::sgs_scheme == libmpdataxx::solvers::smg)
57+
diag_rc();
5158

5259
if (this->rank == 0)
5360
{
@@ -56,6 +63,7 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
5663
using libcloudphxx::lgrngn::particles_t;
5764
using libcloudphxx::lgrngn::CUDA;
5865
using libcloudphxx::lgrngn::multi_CUDA;
66+
using timer = typename parent_t::timer;
5967
#if defined(UWLCM_TIMING)
6068
tbeg = parent_t::clock::now();
6169
#endif
@@ -64,15 +72,13 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
6472
{
6573
assert(!ftr.valid());
6674
if(params.backend == CUDA)
67-
ftr = std::async(
68-
std::launch::async,
75+
ftr = async_timing_launcher<typename parent_t::clock, timer>(
6976
&particles_t<real_t, CUDA>::step_async,
7077
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
7178
params.cloudph_opts
7279
);
7380
else if(params.backend == multi_CUDA)
74-
ftr = std::async(
75-
std::launch::async,
81+
ftr = async_timing_launcher<typename parent_t::clock, timer>(
7682
&particles_t<real_t, multi_CUDA>::step_async,
7783
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
7884
params.cloudph_opts
@@ -106,9 +112,18 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
106112
this->vert_grad_cnt(tmp1, F, params.dz);
107113
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
108114
r_l(ijk) += F(ijk) * this->dt;
115+
116+
tmp1(ijk) = r_c(ijk);
117+
// fill halos for gradient calculation
118+
// TODO: no need to xchng in horizontal, which potentially causes MPI communication
119+
this->xchng_sclr(tmp1, this->ijk, this->halo);
120+
this->vert_grad_cnt(tmp1, F, params.dz);
121+
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
122+
r_c(ijk) += F(ijk) * this->dt;
109123
}
110124

111-
// advect r_l (1st-order)
125+
// advect r_l and r_c (1st-order)
112126
this->self_advec_donorcell(this->r_l);
127+
this->self_advec_donorcell(this->r_c);
113128
negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of ante delayed step");
114129
}

src/solvers/lgrngn/hook_ante_step_lgrngn.hpp

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,28 +7,6 @@
77
template <class ct_params_t>
88
void slvr_lgrngn<ct_params_t>::hook_ante_step()
99
{
10-
if (this->rank == 0)
11-
{
12-
// assuring previous async step finished ...
13-
#if defined(STD_FUTURE_WORKS)
14-
if (
15-
params.async &&
16-
this->timestep != 0 && // ... but not in first timestep ...
17-
((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
18-
) {
19-
assert(ftr.valid());
20-
#if defined(UWLCM_TIMING)
21-
tbeg = parent_t::clock::now();
22-
#endif
23-
ftr.get();
24-
#if defined(UWLCM_TIMING)
25-
tend = parent_t::clock::now();
26-
parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
27-
#endif
28-
} else assert(!ftr.valid());
29-
#endif
30-
}
31-
this->mem->barrier();
3210
parent_t::hook_ante_step(); // includes RHS, which in turn launches sync_in and step_cond
3311
negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv after at the end of hook_ante_step");
3412
}

src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#pragma once
22
#include "../slvr_lgrngn.hpp"
3+
#include "../../detail/func_time.hpp"
34
#if defined(STD_FUTURE_WORKS)
45
# include <future>
56
#endif
@@ -33,6 +34,29 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
3334
Cz.reindex(this->zero) /= (*params.rhod)(this->vert_idx); // TODO: should be interpolated, since theres a shift between positions of rhod and Cz
3435
}
3536

37+
// assuring previous async step finished ...
38+
#if defined(STD_FUTURE_WORKS)
39+
if (
40+
params.async &&
41+
this->timestep != 0 && // ... but not in first timestep ...
42+
((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
43+
) {
44+
assert(ftr.valid());
45+
#if defined(UWLCM_TIMING)
46+
tbeg = parent_t::clock::now();
47+
#endif
48+
#if defined(UWLCM_TIMING)
49+
parent_t::tasync_gpu += ftr.get();
50+
#else
51+
ftr.get();
52+
#endif
53+
#if defined(UWLCM_TIMING)
54+
tend = parent_t::clock::now();
55+
parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
56+
#endif
57+
} else assert(!ftr.valid());
58+
#endif
59+
3660
// start synchronous stuff timer
3761
#if defined(UWLCM_TIMING)
3862
tbeg = parent_t::clock::now();
@@ -62,8 +86,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
6286
{
6387
assert(!ftr.valid());
6488
if(params.backend == CUDA)
65-
ftr = std::async(
66-
std::launch::async,
89+
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
6790
&particles_t<real_t, CUDA>::step_cond,
6891
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
6992
params.cloudph_opts,
@@ -72,8 +95,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
7295
std::map<enum libcloudphxx::common::chem::chem_species_t, libcloudphxx::lgrngn::arrinfo_t<real_t> >()
7396
);
7497
else if(params.backend == multi_CUDA)
75-
ftr = std::async(
76-
std::launch::async,
98+
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
7799
&particles_t<real_t, multi_CUDA>::step_cond,
78100
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
79101
params.cloudph_opts,

src/solvers/slvr_common.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ class slvr_common : public slvr_dim<ct_params_t>
2525
protected:
2626

2727
#if defined(UWLCM_TIMING)
28-
2928
using clock = std::chrono::system_clock;
3029
using timer = std::chrono::milliseconds;
31-
timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
32-
33-
protected:
30+
timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
31+
#else
32+
using timer = void;
33+
using clock = void;
3434
#endif
3535

3636
int spinup; // number of timesteps

0 commit comments

Comments
 (0)