Skip to content

Commit f8452d6

Browse files
committed
Duplicate mmap log file at the beginning of darshan_core_shutdown
This guards against an interrupt occurring while the log file is being processed, which would otherwise leave the log file corrupted.
1 parent b781182 commit f8452d6

File tree

1 file changed

+47
-6
lines changed

1 file changed

+47
-6
lines changed

darshan-runtime/lib/darshan-core.c

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,10 @@ void darshan_core_shutdown(int write_log)
433433
darshan_record_id *mod_shared_recs = NULL;
434434
int shared_rec_cnt = 0;
435435
#endif
436+
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
437+
char dup_log_fame[__DARSHAN_PATH_MAX + 5];
438+
dup_log_fame[0] = '\0';
439+
#endif
436440

437441
/* disable darshan-core while we shut down */
438442
__DARSHAN_CORE_LOCK();
@@ -470,13 +474,41 @@ void darshan_core_shutdown(int write_log)
470474
internal_timing_flag = final_core->config.internal_timing_flag;
471475

472476
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
473-
/* remove the temporary mmap log files */
474-
/* NOTE: this unlink is not immediate as it must wait for the mapping
475-
* to no longer be referenced, which in our case happens when the
476-
* executable exits. If the application terminates mid-shutdown, then
477-
* there will be no mmap files and no final log file.
477+
/* Flush memory-mapped data to the underlying file and then duplicate the
478+
* mmap log file, in case an interrupt happens before the completion of
479+
* this subroutine, leaving the log file corrupted. See github issue #1052.
478480
*/
479-
unlink(final_core->mmap_log_name);
481+
int sys_page_size = sysconf(_SC_PAGESIZE);
482+
size_t mmap_size = sizeof(struct darshan_header) + DARSHAN_JOB_RECORD_SIZE +
483+
+ core->config.name_mem + core->config.mod_mem;
484+
if (mmap_size % sys_page_size)
485+
mmap_size = ((mmap_size / sys_page_size) + 1) * sys_page_size;
486+
487+
msync(final_core->log_hdr_p, mmap_size, MS_SYNC);
488+
489+
/* duplicate the log file */
490+
int mmap_fd;
491+
mmap_fd = open(final_core->mmap_log_name, O_RDONLY, 0644);
492+
if (mmap_fd != -1) {
493+
void *buf;
494+
off_t fileSize = lseek(fd, 0, SEEK_END);
495+
if (fileSize >= 0) {
496+
buf = (void*) malloc(fileSize);
497+
lseek(fd, 0, SEEK_SET);
498+
read(mmap_fd, fileSize, buf);
499+
close(mmap_fd);
500+
snprintf(dup_log_fame, "%s.dup", strlen(final_core->mmap_log_name),
501+
final_core->mmap_log_name);
502+
mmap_fd = open(dup_log_fame, O_CREAT | O_WRONLY, 0644);
503+
if (mmap_fd != -1) {
504+
write(mmap_fd, fileSize, buf);
505+
close(mmap_fd);
506+
}
507+
free(buf);
508+
}
509+
else
510+
close(mmap_fd);
511+
}
480512
#endif
481513

482514
final_core->comp_buf = malloc(final_core->config.mod_mem);
@@ -770,14 +802,23 @@ void darshan_core_shutdown(int write_log)
770802
for(i = 0; i < DARSHAN_KNOWN_MODULE_COUNT; i++)
771803
if(final_core->mod_array[i])
772804
final_core->mod_array[i]->mod_funcs.mod_cleanup_func();
805+
806+
#ifdef __DARSHAN_ENABLE_MMAP_LOGS
807+
/* remove the temporary mmap log files */
808+
unlink(final_core->mmap_log_name);
809+
if (dup_log_fame[0] != '\0') unlink(dup_log_fame);
810+
#endif
811+
773812
darshan_core_cleanup(final_core);
813+
774814
#ifdef HAVE_MPI
775815
if(using_mpi)
776816
{
777817
free(shared_recs);
778818
free(mod_shared_recs);
779819
}
780820
#endif
821+
781822
free(logfile_name);
782823

783824
return;

0 commit comments

Comments
 (0)