Skip to content

Commit 1fb7c5d

Browse files
committed
Collect performance metrics by using one MPI_Gather instead of multiple MPI_Reduce
1 parent 30ee2ca commit 1fb7c5d

File tree

1 file changed

+179
-93
lines changed

1 file changed

+179
-93
lines changed

src/cases/report_timing.cpp

Lines changed: 179 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,19 @@
1414
#include <stdlib.h>
1515
#include <string.h>
1616
#include <libgen.h> /* basename() */
17+
#include <limits.h> /* LONG_MAX */
1718
#include <assert.h>
1819

1920
#include <mpi.h>
2021
#include <e3sm_io.h>
2122

23+
typedef struct {
24+
MPI_Offset nreqs;
25+
MPI_Offset amount;
26+
MPI_Offset sum_decomp_varlen;
27+
double timing[7];
28+
} perf_metrics;
29+
2230
#ifdef ENABLE_PNC
2331
/*----< pnetcdf_check_mem_usage() >------------------------------------------*/
2432
/* check PnetCDF library internal memory usage */
@@ -62,12 +70,11 @@ int print_timing_WR(e3sm_io_config *cfg,
6270
e3sm_io_decom *decom,
6371
case_meta *cmeta)
6472
{
65-
int i, ndecomp;
66-
MPI_Offset off_msg[3], sum_off[3];
67-
MPI_Offset sum_nreqs, sum_amount_WR, max_nreqs, min_nreqs, max_wr, min_wr;
73+
int i, j, ndecomp;
6874
MPI_Offset vlen, sum_decomp_varlen;
69-
double wTime;
7075
MPI_Comm comm=cfg->io_comm;
76+
double ioTime;
77+
perf_metrics metrics, *all_metrics;
7178

7279
ndecomp = decom->num_decomp;
7380

@@ -90,31 +97,54 @@ int print_timing_WR(e3sm_io_config *cfg,
9097
* total size.
9198
*/
9299

93-
off_msg[0] = cmeta->my_nreqs;
94-
off_msg[1] = cmeta->amount_WR;
95-
off_msg[2] = sum_decomp_varlen;
96-
MPI_Reduce(off_msg, sum_off, 3, MPI_OFFSET, MPI_SUM, 0, comm);
97-
sum_nreqs = sum_off[0];
98-
sum_amount_WR = sum_off[1];
99-
sum_decomp_varlen = sum_off[2];
100-
101-
MPI_Reduce(&cmeta->my_nreqs, &max_nreqs, 1, MPI_OFFSET, MPI_MAX, 0, comm);
102-
MPI_Reduce(&cmeta->my_nreqs, &min_nreqs, 1, MPI_OFFSET, MPI_MIN, 0, comm);
103-
MPI_Reduce(&cmeta->amount_WR, &max_wr, 1, MPI_OFFSET, MPI_MAX, 0, comm);
104-
MPI_Reduce(&cmeta->amount_WR, &min_wr, 1, MPI_OFFSET, MPI_MIN, 0, comm);
105-
106-
double dbl_tmp[7], max_dbl[7], min_dbl[7];
107-
dbl_tmp[0] = cmeta->pre_time;
108-
dbl_tmp[1] = cmeta->open_time;
109-
dbl_tmp[2] = cmeta->def_time;
110-
dbl_tmp[3] = cmeta->post_time;
111-
dbl_tmp[4] = cmeta->flush_time;
112-
dbl_tmp[5] = cmeta->close_time;
113-
dbl_tmp[6] = cmeta->end2end_time;
114-
MPI_Reduce(dbl_tmp, max_dbl, 7, MPI_DOUBLE, MPI_MAX, 0, comm);
115-
MPI_Reduce(dbl_tmp, min_dbl, 7, MPI_DOUBLE, MPI_MIN, 0, comm);
100+
metrics.nreqs = cmeta->my_nreqs;
101+
metrics.amount = cmeta->amount_WR;
102+
metrics.sum_decomp_varlen = sum_decomp_varlen;
103+
metrics.timing[0] = cmeta->pre_time;
104+
metrics.timing[1] = cmeta->open_time;
105+
metrics.timing[2] = cmeta->def_time;
106+
metrics.timing[3] = cmeta->post_time;
107+
metrics.timing[4] = cmeta->flush_time;
108+
metrics.timing[5] = cmeta->close_time;
109+
metrics.timing[6] = cmeta->end2end_time;
110+
111+
if (cfg->rank == 0)
112+
all_metrics = (perf_metrics*) malloc(sizeof(perf_metrics) * cfg->np);
113+
114+
MPI_Gather(&metrics, sizeof(perf_metrics), MPI_BYTE,
115+
all_metrics, sizeof(perf_metrics), MPI_BYTE, 0, comm);
116116

117117
if (cfg->rank == 0) {
118+
MPI_Offset sum_nreqs, sum_amount, max_nreqs, min_nreqs, max_amnt, min_amnt;
119+
double max_t[7], min_t[7];
120+
121+
sum_nreqs = 0;
122+
sum_amount = 0;
123+
sum_decomp_varlen = 0;
124+
max_nreqs = 0;
125+
max_amnt = 0;
126+
min_nreqs = LONG_MAX;
127+
min_amnt = LONG_MAX;
128+
for (j=0; j<7; j++) {
129+
max_t[j] = 0;
130+
min_t[j] = (double)LONG_MAX;
131+
}
132+
133+
for (i=0; i<cfg->np; i++) {
134+
sum_nreqs += all_metrics[i].nreqs;
135+
sum_amount += all_metrics[i].amount;
136+
sum_decomp_varlen += all_metrics[i].sum_decomp_varlen;
137+
max_nreqs = MAX(max_nreqs, all_metrics[i].nreqs);
138+
max_amnt = MAX(max_amnt, all_metrics[i].amount);
139+
min_nreqs = MIN(min_nreqs, all_metrics[i].nreqs);
140+
min_amnt = MIN(min_amnt, all_metrics[i].amount);
141+
for (j=0; j<7; j++) {
142+
max_t[j] = MAX(max_t[j], all_metrics[i].timing[j]);
143+
min_t[j] = MIN(min_t[j], all_metrics[i].timing[j]);
144+
}
145+
}
146+
free(all_metrics);
147+
118148
int nvars_noD = cmeta->nvars;
119149
for (i=0; i<ndecomp; i++) nvars_noD -= cmeta->nvars_D[i];
120150

@@ -187,13 +217,13 @@ int print_timing_WR(e3sm_io_config *cfg,
187217
}
188218

189219
if (cfg->strategy == canonical && (cfg->api == netcdf4 || cfg->api == hdf5))
190-
wTime = max_dbl[3];
220+
ioTime = max_t[3];
191221
else if (cfg->strategy == log && cfg->api == netcdf4)
192-
wTime = max_dbl[5];
222+
ioTime = max_t[5];
193223
else if (cfg->api == pnetcdf || cfg->strategy == log || cfg->api == hdf5_md)
194-
wTime = max_dbl[4];
224+
ioTime = max_t[4];
195225
else /* write happens at file close for hdf5 blob and adios blob */
196-
wTime = max_dbl[5];
226+
ioTime = max_t[5];
197227

198228
if (cfg->strategy == log && (cfg->api == hdf5_log || cfg->api == netcdf4)) {
199229
if (cfg->num_subfiles != 0) {
@@ -247,24 +277,24 @@ int print_timing_WR(e3sm_io_config *cfg,
247277
printf("No. I/O flush calls = %6d\n", cmeta->num_flushes);
248278
printf("-----------------------------------------------------------\n");
249279
printf("Indiv write amount MAX = %.2f MiB = %.2f GiB\n",
250-
(double)max_wr / 1048576, (double)max_wr / 1073741824);
280+
(double)max_amnt / 1048576, (double)max_amnt / 1073741824);
251281
printf("Indiv write amount MIN = %.2f MiB = %.2f GiB\n",
252-
(double)min_wr / 1048576, (double)min_wr / 1073741824);
282+
(double)min_amnt / 1048576, (double)min_amnt / 1073741824);
253283
printf("Total write amount = %.2f MiB = %.2f GiB\n",
254-
(double)sum_amount_WR / 1048576, (double)sum_amount_WR / 1073741824);
284+
(double)sum_amount / 1048576, (double)sum_amount / 1073741824);
255285
printf("-----------------------------------------------------------\n");
256-
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_dbl[0], max_dbl[0]);
257-
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_dbl[1], max_dbl[1]);
258-
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_dbl[2], max_dbl[2]);
259-
printf("Time of posting write requests min/max = %8.4f / %8.4f\n", min_dbl[3], max_dbl[3]);
260-
printf("Time of write flushing min/max = %8.4f / %8.4f\n", min_dbl[4], max_dbl[4]);
261-
printf("Time of close min/max = %8.4f / %8.4f\n", min_dbl[5], max_dbl[5]);
262-
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_dbl[6], max_dbl[6]);
286+
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_t[0], max_t[0]);
287+
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_t[1], max_t[1]);
288+
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_t[2], max_t[2]);
289+
printf("Time of posting write requests min/max = %8.4f / %8.4f\n", min_t[3], max_t[3]);
290+
printf("Time of write flushing min/max = %8.4f / %8.4f\n", min_t[4], max_t[4]);
291+
printf("Time of close min/max = %8.4f / %8.4f\n", min_t[5], max_t[5]);
292+
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_t[6], max_t[6]);
263293
printf("Emulate computation time (sleep) min/max = %8.4f / %8.4f\n", (double)(cfg->comp_time), (double)(cfg->comp_time));
264294
printf("I/O bandwidth in MiB/sec (write-only) = %.4f\n",
265-
(double)sum_amount_WR / 1048576.0 / wTime);
295+
(double)sum_amount / 1048576.0 / ioTime);
266296
printf("I/O bandwidth in MiB/sec (open-to-close) = %.4f\n",
267-
(double)sum_amount_WR / 1048576.0 / max_dbl[6]);
297+
(double)sum_amount / 1048576.0 / max_t[6]);
268298
printf("-----------------------------------------------------------\n");
269299

270300
/* print MPI-IO hints actually used */
@@ -308,12 +338,11 @@ int print_timing_RD(e3sm_io_config *cfg,
308338
e3sm_io_decom *decom,
309339
case_meta *cmeta)
310340
{
311-
int i, ndecomp;
312-
MPI_Offset off_msg[3], sum_off[3];
313-
MPI_Offset sum_nreqs, sum_amount_RD, max_nreqs, min_nreqs;
341+
int i, j, ndecomp;
314342
MPI_Offset vlen, sum_decomp_varlen;
315-
double wTime;
316343
MPI_Comm comm=cfg->io_comm;
344+
double ioTime;
345+
perf_metrics metrics, *all_metrics;
317346

318347
ndecomp = decom->num_decomp;
319348

@@ -336,29 +365,54 @@ int print_timing_RD(e3sm_io_config *cfg,
336365
* total size.
337366
*/
338367

339-
off_msg[0] = cmeta->my_nreqs;
340-
off_msg[1] = cmeta->amount_RD;
341-
off_msg[2] = sum_decomp_varlen;
342-
MPI_Reduce(off_msg, sum_off, 3, MPI_OFFSET, MPI_SUM, 0, comm);
343-
sum_nreqs = sum_off[0];
344-
sum_amount_RD = sum_off[1];
345-
sum_decomp_varlen = sum_off[2];
346-
347-
MPI_Reduce(&cmeta->my_nreqs, &max_nreqs, 1, MPI_OFFSET, MPI_MAX, 0, comm);
348-
MPI_Reduce(&cmeta->my_nreqs, &min_nreqs, 1, MPI_OFFSET, MPI_MIN, 0, comm);
349-
350-
double dbl_tmp[7], max_dbl[7], min_dbl[7];
351-
dbl_tmp[0] = cmeta->pre_time;
352-
dbl_tmp[1] = cmeta->open_time;
353-
dbl_tmp[2] = cmeta->def_time;
354-
dbl_tmp[3] = cmeta->post_time;
355-
dbl_tmp[4] = cmeta->flush_time;
356-
dbl_tmp[5] = cmeta->close_time;
357-
dbl_tmp[6] = cmeta->end2end_time;
358-
MPI_Reduce(dbl_tmp, max_dbl, 7, MPI_DOUBLE, MPI_MAX, 0, comm);
359-
MPI_Reduce(dbl_tmp, min_dbl, 7, MPI_DOUBLE, MPI_MIN, 0, comm);
368+
metrics.nreqs = cmeta->my_nreqs;
369+
metrics.amount = cmeta->amount_RD;
370+
metrics.sum_decomp_varlen = sum_decomp_varlen;
371+
metrics.timing[0] = cmeta->pre_time;
372+
metrics.timing[1] = cmeta->open_time;
373+
metrics.timing[2] = cmeta->def_time;
374+
metrics.timing[3] = cmeta->post_time;
375+
metrics.timing[4] = cmeta->flush_time;
376+
metrics.timing[5] = cmeta->close_time;
377+
metrics.timing[6] = cmeta->end2end_time;
378+
379+
if (cfg->rank == 0)
380+
all_metrics = (perf_metrics*) malloc(sizeof(perf_metrics) * cfg->np);
381+
382+
MPI_Gather(&metrics, sizeof(perf_metrics), MPI_BYTE,
383+
all_metrics, sizeof(perf_metrics), MPI_BYTE, 0, comm);
360384

361385
if (cfg->rank == 0) {
386+
MPI_Offset sum_nreqs, sum_amount, max_nreqs, min_nreqs, max_amnt, min_amnt;
387+
double max_t[7], min_t[7];
388+
389+
sum_nreqs = 0;
390+
sum_amount = 0;
391+
sum_decomp_varlen = 0;
392+
max_nreqs = 0;
393+
max_amnt = 0;
394+
min_nreqs = LONG_MAX;
395+
min_amnt = LONG_MAX;
396+
for (j=0; j<7; j++) {
397+
max_t[j] = 0;
398+
min_t[j] = (double)LONG_MAX;
399+
}
400+
401+
for (i=0; i<cfg->np; i++) {
402+
sum_nreqs += all_metrics[i].nreqs;
403+
sum_amount += all_metrics[i].amount;
404+
sum_decomp_varlen += all_metrics[i].sum_decomp_varlen;
405+
max_nreqs = MAX(max_nreqs, all_metrics[i].nreqs);
406+
max_amnt = MAX(max_amnt, all_metrics[i].amount);
407+
min_nreqs = MIN(min_nreqs, all_metrics[i].nreqs);
408+
min_amnt = MIN(min_amnt, all_metrics[i].amount);
409+
for (j=0; j<7; j++) {
410+
max_t[j] = MAX(max_t[j], all_metrics[i].timing[j]);
411+
min_t[j] = MIN(min_t[j], all_metrics[i].timing[j]);
412+
}
413+
}
414+
free(all_metrics);
415+
362416
int nvars_noD = cmeta->nvars;
363417
for (i=0; i<ndecomp; i++) nvars_noD -= cmeta->nvars_D[i];
364418

@@ -379,40 +433,64 @@ int print_timing_RD(e3sm_io_config *cfg,
379433
printf("Variable read order: same as variables are defined\n");
380434

381435
if (cfg->strategy == canonical) {
382-
if (cfg->api == pnetcdf)
436+
if (cfg->api == pnetcdf) {
383437
printf("==== PnetCDF canonical I/O using varn API ============\n");
384-
else if (cfg->api == hdf5)
438+
printf("PnetCDF library version = %s\n",cfg->pnetcdf_ver);
439+
}
440+
else if (cfg->api == hdf5) {
385441
printf("==== HDF5 canonical I/O ==============================\n");
386-
else if (cfg->api == hdf5_md)
442+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
443+
}
444+
else if (cfg->api == hdf5_md) {
387445
printf("==== HDF5 canonical I/O using multi-dataset API ======\n");
388-
else if (cfg->api == netcdf4)
446+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
447+
}
448+
else if (cfg->api == netcdf4) {
389449
printf("==== NetCDF-4 canonical I/O ==========================\n");
450+
printf("NetCDF4 library version = %s\n",cfg->netcdf4_ver);
451+
}
390452
}
391453
else if (cfg->strategy == log) {
392-
if (cfg->api == hdf5)
454+
if (cfg->api == hdf5) {
393455
printf("==== HDF5 using log-based VOL through native APIs=====\n");
394-
else if (cfg->api == hdf5_log)
456+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
457+
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
458+
}
459+
else if (cfg->api == hdf5_log) {
395460
printf("==== HDF5 using log-based VOL APIs ===================\n");
396-
else if (cfg->api == netcdf4)
461+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
462+
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
463+
}
464+
else if (cfg->api == netcdf4) {
397465
printf("==== NetCDF-4 using HDF5 log-based VOL ===============\n");
466+
printf("NetCDF4 library version = %s\n",cfg->netcdf4_ver);
467+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
468+
printf("HDF5 Log VOL connector version = %s\n",cfg->hdf5_log_ver);
469+
}
398470
}
399471
else if (cfg->strategy == blob) {
400-
if (cfg->api == pnetcdf)
472+
if (cfg->api == pnetcdf) {
401473
printf("==== PnetCDF blob I/O ================================\n");
402-
else if (cfg->api == hdf5)
474+
printf("PnetCDF library version = %s\n",cfg->pnetcdf_ver);
475+
}
476+
else if (cfg->api == hdf5) {
403477
printf("==== HDF5 blob I/O ===================================\n");
404-
else if (cfg->api == adios)
478+
printf("HDF5 library version = %s\n",cfg->hdf5_ver);
479+
}
480+
else if (cfg->api == adios) {
405481
printf("==== ADIOS blob I/O ==================================\n");
482+
printf("ADIOS library version = %s\n",cfg->adios_ver);
483+
}
406484
}
407485

408486
if (cfg->strategy == canonical && (cfg->api == netcdf4 || cfg->api == hdf5))
409-
wTime = max_dbl[3];
487+
ioTime = max_t[3];
410488
else if (cfg->strategy == log && cfg->api == netcdf4)
411-
wTime = max_dbl[5];
489+
ioTime = max_t[5];
412490
else if (cfg->api == pnetcdf || cfg->strategy == log || cfg->api == hdf5_md)
413-
wTime = max_dbl[4];
491+
ioTime = max_t[4];
414492
else /* read happens at file close for hdf5 blob and adios blob */
415-
wTime = max_dbl[5];
493+
ioTime = max_t[5];
416494

417495
if (cfg->strategy == log && (cfg->api == hdf5_log || cfg->api == netcdf4)) {
418496
if (cfg->num_subfiles != 0) {
@@ -465,27 +543,35 @@ int print_timing_RD(e3sm_io_config *cfg,
465543
printf("I/O flush frequency = %6d\n", cmeta->ffreq);
466544
printf("No. I/O flush calls = %6d\n", cmeta->num_flushes);
467545
printf("-----------------------------------------------------------\n");
468-
printf("Total read amount = %.2f MiB = %.2f GiB\n",
469-
(double)sum_amount_RD / 1048576, (double)sum_amount_RD / 1073741824);
470-
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_dbl[0], max_dbl[0]);
471-
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_dbl[1], max_dbl[1]);
472-
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_dbl[2], max_dbl[2]);
473-
printf("Time of posting read requests min/max = %8.4f / %8.4f\n", min_dbl[3], max_dbl[3]);
474-
printf("Time of read flushing min/max = %8.4f / %8.4f\n", min_dbl[4], max_dbl[4]);
475-
printf("Time of close min/max = %8.4f / %8.4f\n", min_dbl[5], max_dbl[5]);
476-
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_dbl[6], max_dbl[6]);
546+
printf("Indiv write amount MAX = %.2f MiB = %.2f GiB\n",
547+
(double)max_amnt / 1048576, (double)max_amnt / 1073741824);
548+
printf("Indiv write amount MIN = %.2f MiB = %.2f GiB\n",
549+
(double)min_amnt / 1048576, (double)min_amnt / 1073741824);
550+
printf("Total read amount = %.2f MiB = %.2f GiB\n",
551+
(double)sum_amount / 1048576, (double)sum_amount / 1073741824);
552+
printf("Time of I/O preparing min/max = %8.4f / %8.4f\n", min_t[0], max_t[0]);
553+
printf("Time of file open/create min/max = %8.4f / %8.4f\n", min_t[1], max_t[1]);
554+
printf("Time of define variables min/max = %8.4f / %8.4f\n", min_t[2], max_t[2]);
555+
printf("Time of posting read requests min/max = %8.4f / %8.4f\n", min_t[3], max_t[3]);
556+
printf("Time of read flushing min/max = %8.4f / %8.4f\n", min_t[4], max_t[4]);
557+
printf("Time of close min/max = %8.4f / %8.4f\n", min_t[5], max_t[5]);
558+
printf("end-to-end time min/max = %8.4f / %8.4f\n", min_t[6], max_t[6]);
477559
printf("Emulate computation time (sleep) min/max = %8.4f / %8.4f\n", (double)(cfg->comp_time), (double)(cfg->comp_time));
478560
printf("I/O bandwidth in MiB/sec (read-only) = %.4f\n",
479-
(double)sum_amount_RD / 1048576.0 / wTime);
561+
(double)sum_amount / 1048576.0 / ioTime);
480562
printf("I/O bandwidth in MiB/sec (open-to-close) = %.4f\n",
481-
(double)sum_amount_RD / 1048576.0 / max_dbl[6]);
563+
(double)sum_amount / 1048576.0 / max_t[6]);
482564
printf("-----------------------------------------------------------\n");
483565

484566
/* print MPI-IO hints actually used */
485567
if (cfg->verbose) print_info(&cmeta->info_used);
486568
}
487569
fflush(stdout);
488570

571+
#ifdef ENABLE_PNC
572+
pnetcdf_check_mem_usage(comm);
573+
#endif
574+
489575
if (cmeta->info_used != MPI_INFO_NULL) MPI_Info_free(&cmeta->info_used);
490576

491577
return 0;

0 commit comments

Comments
 (0)