diff --git a/INSTALL b/INSTALL index 6b9bdb509..524e4463b 100644 --- a/INSTALL +++ b/INSTALL @@ -147,8 +147,6 @@ Here lists a few important options: size when argument count is zero. [default: enabled] --enable-doxygen Enable generation of documentation. [default: disabled] - --disable-file-sync Disable MPI file sync if you know your file system - can provide data consistency. [default: enabled] --enable-large-file-test Enable testing for large (>4GB) file/variable I/O. Note "make check" can run very slow. [default: diff --git a/configure.ac b/configure.ac index 609044688..3c64009d3 100644 --- a/configure.ac +++ b/configure.ac @@ -130,7 +130,6 @@ AH_TEMPLATE([NF_INT8_IS_C_], [C type for Fortran INT8]) AH_TEMPLATE([NF_INT8_T], [Type for Fortran INT8]) AH_TEMPLATE([NF_REAL_IS_C_], [C type for Fortran REAL]) AH_TEMPLATE([NO_IEEE_FLOAT], [Does system have IEEE FLOAT]) -AH_TEMPLATE([DISABLE_FILE_SYNC], [Define if to disable MPI_File_sync]) dnl AH_TEMPLATE([ENABLE_IN_PLACE_SWAP], [Define if to enable in-place byte swap]) dnl AH_TEMPLATE([DISABLE_IN_PLACE_SWAP],[Define if to disable in-place byte swap]) AH_TEMPLATE([ENABLE_SUBFILING], [Define if to enable subfiling feature]) @@ -2221,16 +2220,6 @@ AC_SUBST(LATEX) AC_SUBST(DVIPDF) AM_CONDITIONAL([HAS_LATEX], [test "x$has_latex" = xyes]) -AC_ARG_ENABLE([file-sync], - [AS_HELP_STRING([--disable-file-sync], - [Disable MPI file sync if you know your file system can - provide data consistency. 
@<:@default: enabled@:>@])], - [file_sync=${enableval}], [file_sync=yes] -) -if test "x${file_sync}" = xno ; then - AC_DEFINE(DISABLE_FILE_SYNC) -fi - AC_ARG_ENABLE([large-single-req], [AS_HELP_STRING([--enable-large-single-req], [Enable large (> 2 GiB) single request in individual MPI-IO diff --git a/doc/README.consistency.md b/doc/README.consistency.md index 56a445873..d4c916afd 100644 --- a/doc/README.consistency.md +++ b/doc/README.consistency.md @@ -1,7 +1,25 @@ ## Note on parallel I/O data consistency -PnetCDF follows the same parallel I/O data consistency as MPI-IO standard. -Refer the URL below for more information. +PnetCDF follows the same parallel I/O data consistency as MPI-IO standard, +quoted below. + +``` +Consistency semantics define the outcome of multiple accesses to a single file. +All file accesses in MPI are relative to a specific file handle created from a +collective open. MPI provides three levels of consistency: + * sequential consistency among all accesses using a single file handle, + * sequential consistency among all accesses using file handles created from a + single collective open with atomic mode enabled, and + * user-imposed consistency among accesses other than the above. +Sequential consistency means the behavior of a set of operations will be as if +the operations were performed in some serial order consistent with program +order; each access appears atomic, although the exact ordering of accesses is +unspecified. User-imposed consistency may be obtained using program order and +calls to MPI_FILE_SYNC. +``` + +Users are referred to the MPI standard Chapter 14.6 Consistency and Semantics +for more information. http://www.mpi-forum.org/docs/mpi-2.2/mpi22-report/node296.htm#Node296 Readers are also referred to the following paper. 
@@ -9,19 +27,27 @@ Rajeev Thakur, William Gropp, and Ewing Lusk, On Implementing MPI-IO Portably and with High Performance, in the Proceedings of the 6th Workshop on I/O in Parallel and Distributed Systems, pp. 23-32, May 1999. -If users would like PnetCDF to enforce a stronger consistency, they should add -NC_SHARE flag when open/create the file. By doing so, PnetCDF adds -MPI_File_sync() after each MPI I/O calls. - * For PnetCDF collective APIs, an MPI_Barrier() will also be called right - after MPI_File_sync(). - * For independent APIs, there is no need for calling MPI_Barrier(). - -Users are warned that the I/O performance when using NC_SHARE flag could become -significantly slower than not using it. - -If NC_SHARE is not set, then users are responsible for their desired data -consistency. To enforce a stronger consistency, users can explicitly call -ncmpi_sync(). In ncmpi_sync(), MPI_File_sync() and MPI_Barrier() are called. +* NC_SHARE has been deprecated in PnetCDF release of 1.13.0. + + NC_SHARE is a legacy flag inherited from NetCDF-3, whose purpose is to + provide some degree of data consistency for multiple processes concurrently + accessing a shared file. To achieve a stronger consistency, user + applications are required to also synchronize the processes, such as + calling MPI_Barrier, together with nc_sync. + + Because PnetCDF follows the MPI file consistency, which only addresses the + case when all file accesses are relative to a specific file handle created + from a collective open, NC_SHARE becomes invalid. Note that NetCDF-3 + supports only sequential I/O and thus has no collective file open per se. + +If users would like a stronger consistency, they may consider using the code +fragment below after each collective write API call (e.g. +`ncmpi_put_vara_int_all`, `ncmpi_wait_all`, `ncmpi_enddef`, `ncmpi_redef`, +`ncmpi_begin_indep_data`, `ncmpi_end_indep_data`). 
+``` + ncmpi_sync(ncid); + MPI_Barrier(comm); + ncmpi_sync(ncid); +``` +Users are warned that the I/O performance could become significantly slower. ### Note on header consistency in memory and file In data mode, changes to file header can happen in the following scenarios. diff --git a/man/pnetcdf.m4 b/man/pnetcdf.m4 index 1af08100d..cc8fb653b 100644 --- a/man/pnetcdf.m4 +++ b/man/pnetcdf.m4 @@ -495,10 +495,9 @@ Creates a new netCDF dataset at ARG(path) collectively by a group of MPI processes specified by ARG(comm), returning a netCDF ID in ARG(ncid). The argument ARG(cmode) may <<include>> the bitwise-or of the following flags: MACRO(NOCLOBBER) to protect existing datasets (default is MACRO(CLOBBER), -silently blows them away), MACRO(SHARE) for stronger metadata data consistency -control, MACRO(64BIT_OFFSET) to create a file in the 64-bit offset format -(CDF-2), as opposed to classic format, the default, or MACRO(64BIT_DATA) to -create a file in the 64-bit data format (CDF-5). +silently blows them away), MACRO(64BIT_OFFSET) to create a file in the +64-bit offset format (CDF-2), as opposed to classic format, the default, or +MACRO(64BIT_DATA) to create a file in the 64-bit data format (CDF-5). Use either MACRO(64BIT_OFFSET) or MACRO(64BIT_DATA). The 64-bit offset format allows the creation of very large files with far fewer restrictions than netCDF classic format, but can only be read by the netCDF @@ -530,7 +529,7 @@ Opens an existing netCDF dataset at ARG(path) collectively by a group of MPI processes specified by ARG(comm), returning a netCDF ID in ARG(ncid). The type of access is described by the ARG(mode) parameter, which may <<include>> the bitwise-or of the following flags: MACRO(WRITE) for read-write access (default -read-only), MACRO(SHARE) for stronger metadata data consistency control. +read-only). .sp ifelse(DAP,TRUE, <> flushes cached data by calling MPI_File_sync. 
.HP FDECL(abort, (INCID())) diff --git a/man/pnetcdf_f90.m4 b/man/pnetcdf_f90.m4 index cb88aa9a9..cf2f7489b 100644 --- a/man/pnetcdf_f90.m4 +++ b/man/pnetcdf_f90.m4 @@ -74,10 +74,9 @@ Creates a new netCDF dataset at \fIpath\fP collectively by a group of MPI processes specified by \fIcomm\fP, returning a netCDF ID in \fIncid\fP. The argument \fIcmode\fP may include the bitwise-or of the following flags: \fBnf90_noclobber\fR to protect existing datasets (default is \fBnf90_clobber\fR, -silently blows them away), \fBnf90_share\fR for stronger metadata data consistency -control, \fBnf90_64bit_offset\fR to create a file in the 64-bit offset format -(CDF-2), as opposed to classic format, the default, or \fBnf90_64bit_data\fR to -create a file in the 64-bit data format (CDF-5). +silently blows them away), \fBnf90_64bit_offset\fR to create a file in the +64-bit offset format (CDF-2), as opposed to classic format, the default, or +\fBnf90_64bit_data\fR to create a file in the 64-bit data format (CDF-5). Use either \fBnf90_64bit_offset\fR or \fBnf90_64bit_data\fR. The 64-bit offset format allows the creation of very large files with far fewer restrictions than netCDF classic format, but can only be read by the netCDF @@ -115,7 +114,7 @@ Opens an existing netCDF dataset at \fIpath\fP collectively by a group of MPI processes specified by \fIcomm\fP, returning a netCDF ID in \fIncid\fP. The type of access is described by the \fImode\fP parameter, which may include the bitwise-or of the following flags: \fBnf90_write\fR for read-write access (default -read-only), \fBnf90_share\fR for stronger metadata data consistency control. +read-only). 
.sp The argument \fImode\fP must be consistent among all MPI processes that @@ -158,11 +157,7 @@ integer, intent(in) :: ncid integer :: nf90mpi_sync .fi .sp -Unless the -\fBnf90_share\fR -bit is set in -\fBnf90mpi_open(\|)\fR or \fBnf90mpi_create(\|)\fR, -data written by PnetCDF APIs may be cached by local file system on each compute +Data written by PnetCDF APIs may be cached by local file system on each compute node. This API flushes cached data by calling MPI_File_sync. .RE .HP diff --git a/sneak_peek.md b/sneak_peek.md index 228080c3e..31c073ac6 100644 --- a/sneak_peek.md +++ b/sneak_peek.md @@ -22,6 +22,9 @@ This is essentially a placeholder for the next release note ... + none * Configure options + + `--disable-file-sync` is now deprecated. This configure option alone does + not provide sufficient data consistency. Users are advised to call + `ncmpi_sync` and `MPI_Barrier` to achieve the desired consistency. + `--enable-install-examples` to install example programs under folder `${prefix}/pnetcdf_examples` along with run script files. An example is `${prefix}/pnetcdf_examples/C/run_c_examples.sh`. The default of this @@ -53,10 +56,16 @@ This is essentially a placeholder for the next release note ... + none * API syntax changes - + none + + File open flag NC_SHARE is now deprecated. It is still defined, but has + no effect. * API semantics updates - + none + + NC_SHARE alone is not sufficient to provide data consistency for accessing + a shared file in parallel and thus is now deprecated. Because PnetCDF + follows the MPI file consistency, which only addresses the case when all + file accesses are relative to a specific file handle created from a + collective open, NC_SHARE becomes invalid. See doc/README.consistency.md + for more information. 
* New error code precedence + none diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index 7d9db0058..6529101a1 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -1800,8 +1800,7 @@ ncmpi_set_default_format(int format, int *old_formatp) /*----< ncmpi_inq_default_format() >-----------------------------------------*/ /* returns a value suitable for a create flag. Will return one or more of the - * following values OR-ed together: - * NC_64BIT_OFFSET, NC_CLOBBER, NC_LOCK, NC_SHARE */ + * following values OR-ed together: NC_64BIT_OFFSET, NC_CLOBBER */ int ncmpi_inq_default_format(int *formatp) { diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index 93ed4afc4..44b05fdae 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -416,7 +416,7 @@ struct NC { #define NC_ndirty(ncp) fIsSet((ncp)->flags, NC_NDIRTY) #define set_NC_hdirty(ncp) fSet((ncp)->flags, NC_HDIRTY) #define NC_hdirty(ncp) fIsSet((ncp)->flags, NC_HDIRTY) -#define NC_doFsync(ncp) fIsSet((ncp)->iomode, NC_SHARE) + #define NC_doHsync(ncp) fIsSet((ncp)->flags, NC_HSYNC) #define NC_doNsync(ncp) fIsSet((ncp)->flags, NC_NSYNC) diff --git a/src/drivers/ncmpio/ncmpio_close.c b/src/drivers/ncmpio/ncmpio_close.c index b167906a7..d0b5e39a5 100644 --- a/src/drivers/ncmpio/ncmpio_close.c +++ b/src/drivers/ncmpio/ncmpio_close.c @@ -157,12 +157,6 @@ ncmpio_close(void *ncdp) } #endif - /* If the user wants a stronger data consistency by setting NC_SHARE */ - if (NC_doFsync(ncp)) { - err = ncmpio_file_sync(ncp); /* calling MPI_File_sync() */ - if (status == NC_NOERR) status = err; - } - /* calling MPI_File_close() */ err = ncmpio_close_files(ncp, 0); if (status == NC_NOERR) status = err; diff --git a/src/drivers/ncmpio/ncmpio_enddef.c b/src/drivers/ncmpio/ncmpio_enddef.c index 21a9f5e5c..b20b23075 100644 --- a/src/drivers/ncmpio/ncmpio_enddef.c +++ b/src/drivers/ncmpio/ncmpio_enddef.c @@ -1164,10 +1164,6 @@ ncmpio__enddef(void *ncdp, 
fClr(ncp->ncp_sf->flags, NC_MODE_CREATE | NC_MODE_DEF); #endif - /* If the user sets NC_SHARE, we enforce a stronger data consistency */ - if (NC_doFsync(ncp)) - ncmpio_file_sync(ncp); /* calling MPI_File_sync() */ - return status; } diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c index 734f6975f..7bad1f6ed 100644 --- a/src/drivers/ncmpio/ncmpio_file_misc.c +++ b/src/drivers/ncmpio/ncmpio_file_misc.c @@ -85,17 +85,6 @@ ncmpio_redef(void *ncdp) if (NC_indep(ncp)) /* exit independent mode, if in independent mode */ ncmpio_end_indep_data(ncp); -#if 0 - /* header metadata is always sync-ed among all processes, except for - * numrecs when in independent data mode. It has been sync-ed above when - * calling ncmpio_end_indep_data() - */ - if (NC_doFsync(ncp)) { /* re-read the header from file */ - int err = ncmpio_read_NC(ncp); - if (err != NC_NOERR) return err; - } -#endif - /* duplicate a header to be used in enddef() for checking if header grows */ ncp->old = dup_NC(ncp); if (ncp->old == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) @@ -123,22 +112,8 @@ ncmpio_begin_indep_data(void *ncdp) DEBUG_RETURN_ERROR(NC_EINDEP) */ - /* we need no MPI_File_sync() here. If users want a stronger data - * consistency, they can call ncmpi_sync() - */ -#if 0 && !defined(DISABLE_FILE_SYNC) - if (!NC_readonly(ncp) && ncp->collective_fh != MPI_FILE_NULL) { - /* calling file sync for those already open the file */ - int err, mpireturn; - /* MPI_File_sync() is collective */ - TRACE_IO(MPI_File_sync)(ncp->collective_fh); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); - if (err == NC_NOERR) return err; - } - TRACE_COMM(MPI_Barrier)(ncp->comm); - } -#endif + /* If users want a stronger data consistency, ncmpi_sync() should be called + * following this subroutine. 
*/ /* raise independent flag */ fSet(ncp->flags, NC_MODE_INDEP); @@ -193,22 +168,6 @@ ncmpio_end_indep_data(void *ncdp) status = ncmpio_sync_numrecs(ncp); /* the only possible dirty part of the header is numrecs */ } - -#ifndef DISABLE_FILE_SYNC - /* calling file sync for those already open the file */ - if (NC_doFsync(ncp) && ncp->independent_fh != MPI_FILE_NULL) { - int mpireturn; - /* MPI_File_sync() is collective */ - TRACE_IO(MPI_File_sync)(ncp->independent_fh); - if (mpireturn != MPI_SUCCESS) { - int err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); - if (status == NC_NOERR) status = err; - } - TRACE_COMM(MPI_Barrier)(ncp->comm); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, "MPI_Barrier"); - } -#endif } fClr(ncp->flags, NC_MODE_INDEP); @@ -247,11 +206,6 @@ ncmpio_abort(void *ncdp) /* exit independent mode, if in independent mode */ status = ncmpio_end_indep_data(ncp); /* will sync header */ } - - if (NC_doFsync(ncp)) { - err = ncmpio_file_sync(ncp); /* calling MPI_File_sync() */ - if (status == NC_NOERR ) status = err; - } } /* close the file */ diff --git a/src/drivers/ncmpio/ncmpio_getput.m4 b/src/drivers/ncmpio/ncmpio_getput.m4 index b5c473239..c13ea2a93 100644 --- a/src/drivers/ncmpio/ncmpio_getput.m4 +++ b/src/drivers/ncmpio/ncmpio_getput.m4 @@ -352,12 +352,6 @@ err_check: } } - if (NC_doFsync(ncp)) { /* NC_SHARE is set */ - TRACE_IO(MPI_File_sync)(fh); - if (fIsSet(reqMode, NC_REQ_COLL)) - TRACE_COMM(MPI_Barrier)(ncp->comm); - } - return status; } diff --git a/src/drivers/ncmpio/ncmpio_header_put.c b/src/drivers/ncmpio/ncmpio_header_put.c index ea3014dee..ca6591933 100644 --- a/src/drivers/ncmpio/ncmpio_header_put.c +++ b/src/drivers/ncmpio/ncmpio_header_put.c @@ -588,19 +588,6 @@ int ncmpio_write_header(NC *ncp) } } - if (NC_doFsync(ncp)) { /* NC_SHARE is set */ - TRACE_IO(MPI_File_sync)(fh); - if (mpireturn != MPI_SUCCESS) { - ncmpii_error_mpi2nc(mpireturn,"MPI_File_sync"); - DEBUG_RETURN_ERROR(NC_EMPI) - } - 
TRACE_COMM(MPI_Barrier)(ncp->comm); - if (mpireturn != MPI_SUCCESS) { - ncmpii_error_mpi2nc(mpireturn,"MPI_Barrier"); - DEBUG_RETURN_ERROR(NC_EMPI) - } - } - return status; } diff --git a/src/drivers/ncmpio/ncmpio_sync.c b/src/drivers/ncmpio/ncmpio_sync.c index 91f053890..d6c34f3d5 100644 --- a/src/drivers/ncmpio/ncmpio_sync.c +++ b/src/drivers/ncmpio/ncmpio_sync.c @@ -33,7 +33,6 @@ */ int ncmpio_file_sync(NC *ncp) { -#ifndef DISABLE_FILE_SYNC int mpireturn; if (ncp->independent_fh != MPI_FILE_NULL) { @@ -49,7 +48,7 @@ ncmpio_file_sync(NC *ncp) { return ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); TRACE_COMM(MPI_Barrier)(ncp->comm); -#endif + return NC_NOERR; } @@ -205,24 +204,6 @@ ncmpio_sync_numrecs(void *ncdp) /* clear numrecs dirty bit */ fClr(ncp->flags, NC_NDIRTY); -#ifndef DISABLE_FILE_SYNC - if (NC_doFsync(ncp)) { /* NC_SHARE is set */ - int mpierr, mpireturn; - if (NC_indep(ncp)) { - TRACE_IO(MPI_File_sync)(ncp->independent_fh); - } - else { - TRACE_IO(MPI_File_sync)(ncp->collective_fh); - } - if (mpireturn != MPI_SUCCESS) { - mpierr = ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); - if (status == NC_NOERR) status = mpierr; - } - TRACE_COMM(MPI_Barrier)(ncp->comm); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, "MPI_Barrier"); - } -#endif return status; } diff --git a/src/drivers/ncmpio/ncmpio_vard.c b/src/drivers/ncmpio/ncmpio_vard.c index 1ab9f83a3..9c3702d51 100644 --- a/src/drivers/ncmpio/ncmpio_vard.c +++ b/src/drivers/ncmpio/ncmpio_vard.c @@ -371,12 +371,6 @@ getput_vard(NC *ncp, } } } - - if (NC_doFsync(ncp)) { /* NC_SHARE is set */ - TRACE_IO(MPI_File_sync)(fh); - if (fIsSet(reqMode, NC_REQ_COLL)) - TRACE_COMM(MPI_Barrier)(ncp->comm); - } } if (xbuf != NULL && xbuf != buf) NCI_Free(xbuf); diff --git a/src/drivers/ncmpio/ncmpio_wait.c b/src/drivers/ncmpio/ncmpio_wait.c index 0432a25a8..d1730e572 100644 --- a/src/drivers/ncmpio/ncmpio_wait.c +++ b/src/drivers/ncmpio/ncmpio_wait.c @@ -2105,25 +2105,6 @@ wait_getput(NC 
*ncp, /* delay numrecs sync until end_indep, redef or close */ } } - - if (NC_doFsync(ncp)) { /* NC_SHARE is set */ - int mpireturn; - if (coll_indep == NC_REQ_INDEP) { - TRACE_IO(MPI_File_sync)(ncp->independent_fh); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); - if (status == NC_NOERR) status = err; - } - } - else { - TRACE_IO(MPI_File_sync)(ncp->collective_fh); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_sync"); - if (status == NC_NOERR) status = err; - } - TRACE_COMM(MPI_Barrier)(ncp->comm); - } - } } return status;