Skip to content

Commit f806e22

Browse files
author
shanedsnyder
authored
Merge pull request #830 from darshan-hpc/snyder/dev-precise-job-timestamps
ENH: improved precision of darshan job start/end times
2 parents 9731f44 + 5410a73 commit f806e22

26 files changed

+306
-140
lines changed

.github/workflows/main_ci.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ jobs:
2222
platform: [ubuntu-latest,
2323
macos-latest]
2424
python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11"]
25+
# see gh-851
26+
exclude:
27+
- platform: ubuntu-latest
28+
python-version: "3.10"
29+
include:
30+
- platform: ubuntu-22.04
31+
python-version: "3.10"
2532
runs-on: ${{ matrix.platform }}
2633
steps:
2734
- uses: actions/checkout@v3

darshan-runtime/lib/darshan-core.c

Lines changed: 113 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,12 @@ static int darshan_deflate_buffer(
151151
static void darshan_core_cleanup(
152152
struct darshan_core_runtime* core);
153153
static void darshan_core_fork_child_cb(void);
154+
static void darshan_core_reduce_min_time(
155+
void* in_time_v, void* inout_time_v,
156+
int *len, MPI_Datatype *datatype);
157+
static void darshan_core_reduce_max_time(
158+
void* in_time_v, void* inout_time_v,
159+
int *len, MPI_Datatype *datatype);
154160

155161
#define DARSHAN_WARN(__err_str, ...) do { \
156162
darshan_core_fprintf(stderr, "darshan_library_warning: " \
@@ -197,6 +203,7 @@ void darshan_core_initialize(int argc, char **argv)
197203
int jobid;
198204
int ret;
199205
int i;
206+
struct timespec start_ts;
200207

201208
/* setup darshan runtime if darshan is enabled and hasn't been initialized already */
202209
if (__darshan_core != NULL || getenv("DARSHAN_DISABLE"))
@@ -309,7 +316,9 @@ void darshan_core_initialize(int argc, char **argv)
309316

310317
/* set known job-level metadata fields for the log file */
311318
init_core->log_job_p->uid = getuid();
312-
init_core->log_job_p->start_time = time(NULL);
319+
clock_gettime(CLOCK_REALTIME, &start_ts);
320+
init_core->log_job_p->start_time_sec = (int64_t)start_ts.tv_sec;
321+
init_core->log_job_p->start_time_nsec = (int64_t)start_ts.tv_nsec;
313322
init_core->log_job_p->nprocs = nprocs;
314323
init_core->log_job_p->jobid = (int64_t)jobid;
315324

@@ -390,6 +399,7 @@ void darshan_core_shutdown(int write_log)
390399
{
391400
struct darshan_core_runtime *final_core;
392401
double start_log_time;
402+
struct timespec end_ts;
393403
int internal_timing_flag;
394404
double open1 = 0, open2 = 0;
395405
double job1 = 0, job2 = 0;
@@ -408,6 +418,8 @@ void darshan_core_shutdown(int write_log)
408418
int i;
409419
int ret;
410420
#ifdef HAVE_MPI
421+
MPI_Datatype ts_type;
422+
MPI_Op ts_max_op, ts_min_op;
411423
darshan_record_id *shared_recs = NULL;
412424
darshan_record_id *mod_shared_recs = NULL;
413425
int shared_rec_cnt = 0;
@@ -442,7 +454,9 @@ void darshan_core_shutdown(int write_log)
442454
PMPI_Barrier(final_core->mpi_comm);
443455
#endif
444456
start_log_time = darshan_core_wtime_absolute();
445-
final_core->log_job_p->end_time = time(NULL);
457+
clock_gettime(CLOCK_REALTIME, &end_ts);
458+
final_core->log_job_p->end_time_sec = (int64_t)end_ts.tv_sec;
459+
final_core->log_job_p->end_time_nsec = (int64_t)end_ts.tv_nsec;
446460

447461
internal_timing_flag = final_core->config.internal_timing_flag;
448462

@@ -476,22 +490,30 @@ void darshan_core_shutdown(int write_log)
476490
MPI_INT, MPI_SUM, final_core->mpi_comm);
477491

478492
/* reduce to report first start and last end time across all ranks at rank 0 */
493+
/* NOTE: custom MPI max/min reduction operators required for sec/nsec time tuples */
494+
PMPI_Type_contiguous(2, MPI_INT64_T, &ts_type);
495+
PMPI_Type_commit(&ts_type);
496+
PMPI_Op_create(darshan_core_reduce_min_time, 1, &ts_min_op);
497+
PMPI_Op_create(darshan_core_reduce_max_time, 1, &ts_max_op);
479498
if(my_rank == 0)
480499
{
481-
PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time,
482-
1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
483-
PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time,
484-
1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
500+
PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->start_time_sec,
501+
1, ts_type, ts_min_op, 0, final_core->mpi_comm);
502+
PMPI_Reduce(MPI_IN_PLACE, &final_core->log_job_p->end_time_sec,
503+
1, ts_type, ts_max_op, 0, final_core->mpi_comm);
485504
}
486505
else
487506
{
488-
PMPI_Reduce(&final_core->log_job_p->start_time,
489-
&final_core->log_job_p->start_time,
490-
1, MPI_INT64_T, MPI_MIN, 0, final_core->mpi_comm);
491-
PMPI_Reduce(&final_core->log_job_p->end_time,
492-
&final_core->log_job_p->end_time,
493-
1, MPI_INT64_T, MPI_MAX, 0, final_core->mpi_comm);
507+
PMPI_Reduce(&final_core->log_job_p->start_time_sec,
508+
&final_core->log_job_p->start_time_sec,
509+
1, ts_type, ts_min_op, 0, final_core->mpi_comm);
510+
PMPI_Reduce(&final_core->log_job_p->end_time_sec,
511+
&final_core->log_job_p->end_time_sec,
512+
1, ts_type, ts_max_op, 0, final_core->mpi_comm);
494513
}
514+
PMPI_Type_free(&ts_type);
515+
PMPI_Op_free(&ts_min_op);
516+
PMPI_Op_free(&ts_max_op);
495517

496518
/* get a list of records which are shared across all processes */
497519
darshan_get_shared_records(final_core, &shared_recs, &shared_rec_cnt);
@@ -1486,7 +1508,6 @@ static void darshan_get_logfile_name(
14861508

14871509
jobid = core->log_job_p->jobid;
14881510
pid = core->pid;
1489-
start_time = core->log_job_p->start_time;
14901511

14911512
/* first, check if user specifies a complete logpath to use */
14921513
user_logfile_name = getenv("DARSHAN_LOGFILE");
@@ -1514,6 +1535,7 @@ static void darshan_get_logfile_name(
15141535
logmod = darshan_hash((void*)hname,strlen(hname),hlevel);
15151536

15161537
/* use human readable start time format in log filename */
1538+
start_time = (time_t)core->log_job_p->start_time_sec;
15171539
start_tm = localtime(&start_time);
15181540

15191541
if(core->config.log_path_byenv)
@@ -2262,6 +2284,84 @@ static int darshan_core_name_is_excluded(const char *name, darshan_module_id mod
22622284
return(0);
22632285
}
22642286

2287+
#ifdef HAVE_MPI
2288+
static void darshan_core_reduce_min_time(void* in_time_v, void* inout_time_v,
2289+
int *len, MPI_Datatype *datatype)
2290+
{
2291+
int64_t tmp_sec, tmp_nsec;
2292+
int64_t *in_sec = in_time_v;
2293+
int64_t *in_nsec = in_sec+1;
2294+
int64_t *inout_sec = inout_time_v;
2295+
int64_t *inout_nsec = inout_sec+1;
2296+
int i;
2297+
2298+
for(i=0; i<*len; i++)
2299+
{
2300+
/* min */
2301+
if((*in_sec < *inout_sec) ||
2302+
((*in_sec == *inout_sec) &&
2303+
(*in_nsec < *inout_nsec)))
2304+
{
2305+
tmp_sec = *in_sec;
2306+
tmp_nsec = *in_nsec;
2307+
}
2308+
else
2309+
{
2310+
tmp_sec = *inout_sec;
2311+
tmp_nsec = *inout_nsec;
2312+
}
2313+
2314+
/* update pointers */
2315+
*inout_sec = tmp_sec;
2316+
*inout_nsec = tmp_nsec;
2317+
inout_sec+=2;
2318+
inout_nsec+=2;
2319+
in_sec+=2;
2320+
in_nsec+=2;
2321+
}
2322+
2323+
return;
2324+
}
2325+
2326+
static void darshan_core_reduce_max_time(void* in_time_v, void* inout_time_v,
2327+
int *len, MPI_Datatype *datatype)
2328+
{
2329+
int64_t tmp_sec, tmp_nsec;
2330+
int64_t *in_sec = in_time_v;
2331+
int64_t *in_nsec = in_sec+1;
2332+
int64_t *inout_sec = inout_time_v;
2333+
int64_t *inout_nsec = inout_sec+1;
2334+
int i;
2335+
2336+
for(i=0; i<*len; i++)
2337+
{
2338+
/* max */
2339+
if((*in_sec > *inout_sec) ||
2340+
((*in_sec == *inout_sec) &&
2341+
(*in_nsec > *inout_nsec)))
2342+
{
2343+
tmp_sec = *in_sec;
2344+
tmp_nsec = *in_nsec;
2345+
}
2346+
else
2347+
{
2348+
tmp_sec = *inout_sec;
2349+
tmp_nsec = *inout_nsec;
2350+
}
2351+
2352+
/* update pointers */
2353+
*inout_sec = tmp_sec;
2354+
*inout_nsec = tmp_nsec;
2355+
inout_sec+=2;
2356+
inout_nsec+=2;
2357+
in_sec+=2;
2358+
in_nsec+=2;
2359+
}
2360+
2361+
return;
2362+
}
2363+
#endif
2364+
22652365
/* crude benchmarking hook into darshan-core to benchmark Darshan
22662366
* shutdown overhead using a variety of application I/O workloads
22672367
*/

darshan-util/darshan-analyzer.c

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,11 +97,7 @@ int process_log(const char *fname, double *io_ratio, int *used_mpio, int *used_p
9797
if (file->mod_map[DARSHAN_PNETCDF_FILE_MOD].len > 0 || file->mod_map[DARSHAN_PNETCDF_VAR_MOD].len > 0)
9898
*used_pnet += 1;
9999

100-
total_job_time = (double)job.end_time - (double)job.start_time;
101-
if (total_job_time < 1.0)
102-
{
103-
total_job_time = 1.0;
104-
}
100+
darshan_log_get_job_runtime(file, job, &total_job_time);
105101

106102
if (f_count > 0)
107103
{

darshan-util/darshan-diff.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -133,16 +133,16 @@ int main(int argc, char *argv[])
133133

134134
if (job1.uid != job2.uid)
135135
print_int64_diff("# uid:", job1.uid, job2.uid);
136-
if (job1.start_time != job2.start_time)
137-
print_int64_diff("# start_time:", job1.start_time, job2.start_time);
138-
if (job1.end_time != job2.end_time)
139-
print_int64_diff("# end_time:", job1.end_time, job2.end_time);
136+
if (job1.start_time_sec != job2.start_time_sec)
137+
print_int64_diff("# start_time:", job1.start_time_sec, job2.start_time_sec);
138+
if (job1.end_time_sec != job2.end_time_sec)
139+
print_int64_diff("# end_time:", job1.end_time_sec, job2.end_time_sec);
140140
if (job1.nprocs != job2.nprocs)
141141
print_int64_diff("# nprocs:", job1.nprocs, job2.nprocs);
142-
if ((job1.end_time-job1.start_time) != (job2.end_time - job2.start_time))
142+
if ((job1.end_time_sec-job1.start_time_sec) != (job2.end_time_sec-job2.start_time_sec))
143143
print_int64_diff("# run time:",
144-
(int64_t)(job1.end_time - job1.start_time + 1),
145-
(int64_t)(job2.end_time - job2.start_time + 1));
144+
(int64_t)(job1.end_time_sec - job1.start_time_sec + 1),
145+
(int64_t)(job2.end_time_sec - job2.start_time_sec + 1));
146146

147147
/* get hash of record ids to file names for each log */
148148
ret = darshan_log_get_namehash(file1, &name_hash1);

darshan-util/darshan-dxt-parser.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ int main(int argc, char **argv)
4343
int mount_count;
4444
struct darshan_mnt_info *mnt_data_array;
4545
time_t tmp_time = 0;
46-
int64_t run_time = 0;
46+
double run_time;
4747
char *token;
4848
char *save;
4949
char buffer[DARSHAN_JOB_METADATA_LEN];
@@ -107,17 +107,16 @@ int main(int argc, char **argv)
107107
printf("# exe: %s\n", tmp_string);
108108
printf("# uid: %" PRId64 "\n", job.uid);
109109
printf("# jobid: %" PRId64 "\n", job.jobid);
110-
printf("# start_time: %" PRId64 "\n", job.start_time);
111-
tmp_time += job.start_time;
110+
printf("# start_time: %" PRId64 "\n", job.start_time_sec);
111+
tmp_time += job.start_time_sec;
112112
printf("# start_time_asci: %s", ctime(&tmp_time));
113-
printf("# end_time: %" PRId64 "\n", job.end_time);
113+
printf("# end_time: %" PRId64 "\n", job.end_time_sec);
114114
tmp_time = 0;
115-
tmp_time += job.end_time;
115+
tmp_time += job.end_time_sec;
116116
printf("# end_time_asci: %s", ctime(&tmp_time));
117117
printf("# nprocs: %" PRId64 "\n", job.nprocs);
118-
if (job.end_time >= job.start_time)
119-
run_time = job.end_time - job.start_time + 1;
120-
printf("# run time: %" PRId64 "\n", run_time);
118+
darshan_log_get_job_runtime(fd, job, &run_time);
119+
printf("# run time: %.4lf\n", run_time);
121120
for (token = strtok_r(job.metadata, "\n", &save);
122121
token != NULL;
123122
token = strtok_r(NULL, "\n", &save))

0 commit comments

Comments
 (0)