@@ -433,6 +433,10 @@ void darshan_core_shutdown(int write_log)
433433 darshan_record_id * mod_shared_recs = NULL ;
434434 int shared_rec_cnt = 0 ;
435435#endif
436+ #ifdef __DARSHAN_ENABLE_MMAP_LOGS
437+ char dup_log_fame [__DARSHAN_PATH_MAX + 5 ];
438+ dup_log_fame [0 ] = '\0' ;
439+ #endif
436440
437441 /* disable darshan-core while we shut down */
438442 __DARSHAN_CORE_LOCK ();
@@ -470,13 +474,41 @@ void darshan_core_shutdown(int write_log)
470474 internal_timing_flag = final_core -> config .internal_timing_flag ;
471475
472476#ifdef __DARSHAN_ENABLE_MMAP_LOGS
473- /* remove the temporary mmap log files */
474- /* NOTE: this unlink is not immediate as it must wait for the mapping
475- * to no longer be referenced, which in our case happens when the
476- * executable exits. If the application terminates mid-shutdown, then
477- * there will be no mmap files and no final log file.
477+ /* Flush memory-mapped data to the underlying file and then duplicate the
478+ * mmap log file, in case an interrupt happens before the completion of
479+ * this subroutine, leaving the log file corrupted. See github issue #1052.
478480 */
479- unlink (final_core -> mmap_log_name );
481+ int sys_page_size = sysconf (_SC_PAGESIZE );
482+ size_t mmap_size = sizeof (struct darshan_header ) + DARSHAN_JOB_RECORD_SIZE +
483+ + core -> config .name_mem + core -> config .mod_mem ;
484+ if (mmap_size % sys_page_size )
485+ mmap_size = ((mmap_size / sys_page_size ) + 1 ) * sys_page_size ;
486+
487+ msync (final_core -> log_hdr_p , mmap_size , MS_SYNC );
488+
489+ /* duplicate the log file */
490+ int mmap_fd ;
491+ mmap_fd = open (final_core -> mmap_log_name , O_RDONLY , 0644 );
492+ if (mmap_fd != -1 ) {
493+ void * buf ;
494+ off_t fileSize = lseek (fd , 0 , SEEK_END );
495+ if (fileSize >= 0 ) {
496+ buf = (void * ) malloc (fileSize );
497+ lseek (fd , 0 , SEEK_SET );
498+ read (mmap_fd , fileSize , buf );
499+ close (mmap_fd );
500+ snprintf (dup_log_fame , "%s.dup" , strlen (final_core -> mmap_log_name ),
501+ final_core -> mmap_log_name );
502+ mmap_fd = open (dup_log_fame , O_CREAT | O_WRONLY , 0644 );
503+ if (mmap_fd != -1 ) {
504+ write (mmap_fd , fileSize , buf );
505+ close (mmap_fd );
506+ }
507+ free (buf );
508+ }
509+ else
510+ close (mmap_fd );
511+ }
480512#endif
481513
482514 final_core -> comp_buf = malloc (final_core -> config .mod_mem );
@@ -770,14 +802,23 @@ void darshan_core_shutdown(int write_log)
770802 for (i = 0 ; i < DARSHAN_KNOWN_MODULE_COUNT ; i ++ )
771803 if (final_core -> mod_array [i ])
772804 final_core -> mod_array [i ]-> mod_funcs .mod_cleanup_func ();
805+
806+ #ifdef __DARSHAN_ENABLE_MMAP_LOGS
807+ /* remove the temporary mmap log files */
808+ unlink (final_core -> mmap_log_name );
809+ if (dup_log_fame [0 ] != '\0' ) unlink (dup_log_fame );
810+ #endif
811+
773812 darshan_core_cleanup (final_core );
813+
774814#ifdef HAVE_MPI
775815 if (using_mpi )
776816 {
777817 free (shared_recs );
778818 free (mod_shared_recs );
779819 }
780820#endif
821+
781822 free (logfile_name );
782823
783824 return ;
0 commit comments