Skip to content

Commit 542d6ee

Browse files
client: maintain stats correctly when AM detaches/reattaches project
Science United (SU) maintains stats (CPU/GPU estimated credit (EC) and time, and job counts) for each (host, project) pair. The client reports these on each account manager (AM) RPC; the RPC handler computes the differences and updates the stored stats. Problem: if SU detaches a project on a host, the project disappears from the host. If SU later reattaches it, the project will be created with all stats set to zero. When these zeroed stats are reported to SU, the previous stats will be lost. Solution: in the AM RPC reply, for projects not currently on the client but with a host_project record in the SU database, include the stats; in the client, parse these from the RPC reply and use them to initialize the new project.
1 parent a8629cf commit 542d6ee

3 files changed

Lines changed: 31 additions & 10 deletions

File tree

client/acct_mgr.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,12 @@ int AM_ACCOUNT::parse(XML_PARSER& xp) {
302302
resource_share.init();
303303
user_avg_ec = 0;
304304
user_total_ec = 0;
305+
cpu_ec = 0;
306+
cpu_time = 0;
307+
gpu_ec = 0;
308+
gpu_time = 0;
309+
njobs_success = 0;
310+
njobs_error = 0;
305311

306312
while (!xp.get_tag()) {
307313
if (!xp.is_tag) {
@@ -329,6 +335,12 @@ int AM_ACCOUNT::parse(XML_PARSER& xp) {
329335
if (xp.parse_bool("update", update)) continue;
330336
if (xp.parse_double("user_avg_ec", user_avg_ec)) continue;
331337
if (xp.parse_double("user_total_ec", user_total_ec)) continue;
338+
if (xp.parse_double("cpu_ec", cpu_ec)) continue;
339+
if (xp.parse_double("cpu_time", cpu_time)) continue;
340+
if (xp.parse_double("gpu_ec", gpu_ec)) continue;
341+
if (xp.parse_double("gpu_time", gpu_time)) continue;
342+
if (xp.parse_int("njobs_success", njobs_success)) continue;
343+
if (xp.parse_int("njobs_error", njobs_error)) continue;
332344
if (xp.parse_bool("no_cpu", btemp)) {
333345
handle_no_rsc("CPU", btemp);
334346
continue;
@@ -792,6 +804,12 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
792804
if (gstate.acct_mgr_info.dynamic) {
793805
pp->user_expavg_credit = acct.user_avg_ec;
794806
pp->user_total_credit = acct.user_total_ec;
807+
pp->cpu_ec = acct.cpu_ec;
808+
pp->cpu_time = acct.cpu_time;
809+
pp->gpu_ec = acct.gpu_ec;
810+
pp->gpu_time = acct.gpu_time;
811+
pp->njobs_success = acct.njobs_success;
812+
pp->njobs_error = acct.njobs_error;
795813
}
796814
} else {
797815
msg_printf(NULL, MSG_INTERNAL_ERROR,

client/acct_mgr.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,13 @@ struct AM_ACCOUNT {
161161
OPTIONAL_BOOL abort_not_started;
162162
double user_avg_ec;
163163
double user_total_ec;
164+
// the following are present if the client is not already attached to the project
165+
double cpu_ec;
166+
double cpu_time;
167+
double gpu_ec;
168+
double gpu_time;
169+
int njobs_success;
170+
int njobs_error;
164171

165172
void handle_no_rsc(const char*, bool);
166173
int parse(XML_PARSER&);

client/project.h

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -121,12 +121,14 @@ struct PROJECT : PROJ_AM {
121121
// and this should go to 1.
122122
// But we need to keep it around for older projects
123123

124-
// accounting info; estimated credit and time for CPU and GPU
124+
// accounting info for dynamic account managers;
125125
//
126-
double cpu_ec;
127-
double cpu_time;
126+
double cpu_ec; // estimated credit
127+
double cpu_time; // device/seconds
128128
double gpu_ec;
129129
double gpu_time;
130+
int njobs_success;
131+
int njobs_error;
130132

131133
// stuff related to scheduler RPCs and master fetch
132134
//
@@ -154,6 +156,7 @@ struct PROJECT : PROJ_AM {
154156
// computed by get_disk_usages()
155157
double disk_share;
156158
// computed by get_disk_shares();
159+
bool dont_use_dcf;
157160

158161
/////// END OF ITEMS STORED IN client_state.xml
159162

@@ -192,8 +195,6 @@ struct PROJECT : PROJ_AM {
192195
// if nonzero, send this project's job log from that point on
193196
bool send_full_workload;
194197

195-
bool dont_use_dcf;
196-
197198
bool suspended_via_gui;
198199
bool dont_request_more_work;
199200
// Return work, but don't request more
@@ -316,11 +317,6 @@ struct PROJECT : PROJ_AM {
316317
//
317318
APP_CONFIGS app_configs;
318319

319-
// job counting
320-
//
321-
int njobs_success;
322-
int njobs_error;
323-
324320
// total elapsed time of this project's jobs (for export to GUI)
325321
//
326322
double elapsed_time;

0 commit comments

Comments
 (0)