diff --git a/.github/workflows/mac_mpich.yml b/.github/workflows/mac_mpich.yml index 8bab7aed4..72ca9c903 100644 --- a/.github/workflows/mac_mpich.yml +++ b/.github/workflows/mac_mpich.yml @@ -142,6 +142,42 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + rm -rf pnetcdf_output + mkdir -p pnetcdf_output + ./configure --disable-fortran \ + --with-mpi=${GITHUB_WORKSPACE}/MPICH \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/mac_openmpi.yml b/.github/workflows/mac_openmpi.yml index 7cdf0b2c1..65fcb10be 100644 --- a/.github/workflows/mac_openmpi.yml +++ b/.github/workflows/mac_openmpi.yml @@ -144,6 +144,42 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + rm -rf pnetcdf_output + mkdir -p pnetcdf_output + ./configure --disable-fortran \ + --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/ubuntu_mpich.yml b/.github/workflows/ubuntu_mpich.yml index 68aae837f..09d626f8f 100644 --- a/.github/workflows/ubuntu_mpich.yml +++ b/.github/workflows/ubuntu_mpich.yml @@ -154,6 +154,39 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --with-mpi=${GITHUB_WORKSPACE}/MPICH + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log 
+ - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/ubuntu_openmpi.yml b/.github/workflows/ubuntu_openmpi.yml index b995f70d1..80f087295 100644 --- a/.github/workflows/ubuntu_openmpi.yml +++ b/.github/workflows/ubuntu_openmpi.yml @@ -150,6 +150,41 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + mkdir -p pnetcdf_output + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index f1b3693a6..0ca204b83 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -149,12 +149,17 @@ 10. Generate SHA1 checksums * Run command: ``` - openssl sha1 pnetcdf-1.11.0.tar.gz` + openssl sha1 pnetcdf-1.11.0.tar.gz ``` * Example command-line output: ``` SHA1(pnetcdf-1.11.0.tar.gz)= 495d42f0a41abbd09d276262dce0f7c1c535968a ``` + * Or use SHA-256 + ``` + sha256sum pnetcdf-1.11.0.tar.gz + a18a1a43e6c4fd7ef5827dbe90e9dcf1363b758f513af1f1356ed6c651195a9f pnetcdf-1.11.0.tar.gz + ``` 11. Update PnetCDF Web Page * https://github.com/Parallel-NetCDF/Parallel-NetCDF.github.io * Create a new file of release note Parallel-NetCDF.github.io/Release_notes/1.11.0.md. diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c0e031f48..0007f5d68 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,6 +6,44 @@ PnetCDF Release Notes Version _PNETCDF_VERSION_ (_PNETCDF_RELEASE_DATE_) ------------------------------------- +* New feature + + Intra-node aggregation for read requests is added. This complements the + intra-node aggregation for write requests first implemented in version + 1.14.0. Now intra-node aggregation supports both write and read + operations. This feature can be enabled by setting hint + "nc_num_aggrs_per_node" to the desired number of aggregators per compute + node. + +* New optimization + + A new internal I/O driver, named "pncio", is added, which implements + several strategies for performance improvement.
A significant portion of this + driver was developed to improve performance when Lustre is used. It + includes the following: + * When creating a new file, it tries to set the Lustre file striping count + to the number of compute nodes allocated to the MPI communicator passed + to "ncmpi_create()", when I/O hint "striping_factor" is not explicitly + set by the applications. + * It automatically sets a good value for hint "cb_nodes" when it is not + explicitly set by the applications. + +* API deprecated + + "vard" APIs introduced in version 1.6.0 are now deprecated. This is the + family of APIs that take an argument of an MPI derived data type + describing the file access layout, which is used as the fileview by the + underlying MPI library. + +* New error code + + "NC_EFSTYPE" indicates an error when an invalid file system type is + detected. + +* New PnetCDF hint + + "nc_pncio" -- To enable or disable the use of the internal "pncio" driver. + Its string value is either "enable" or "disable". The default is "enable". + + +------------------------------------- +Version 1.14.1 (July 31, 2025) +------------------------------------- + * New optimization + When file header extent size grows, moving the data section to a higher file offset has changed to be done in chunks of 16 MB per process. diff --git a/benchmarks/C/Makefile.am b/benchmarks/C/Makefile.am index 333176cbf..e45408dcd 100644 --- a/benchmarks/C/Makefile.am +++ b/benchmarks/C/Makefile.am @@ -40,14 +40,30 @@ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ # be used to compare NetCDF4 performance against PnetCDF. EXTRA_DIST = parallel_run.sh netcdf_put_vara.c -ptest ptests ptest4: $(check_PROGRAMS) +ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 4 || exit 1 -ptest2 ptest6 ptest8 ptest10: +ptest2: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 2 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 2 || exit 1 + +ptest10: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 10 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 10 || exit 1 + +ptest6 ptest8: + +ptests: ptest2 ptest4 ptest10 # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/benchmarks/C/parallel_run.sh b/benchmarks/C/parallel_run.sh index be9212db8..8f2593030 100755 --- a/benchmarks/C/parallel_run.sh +++ b/benchmarks/C/parallel_run.sh @@ -10,15 +10,18 @@ set -e VALIDATOR=../../src/utils/ncvalidator/ncvalidator NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` - MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" +# remove file system type prefix if there is any +OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` + +# let NTHREADS=$1*6-1 +NTHREADS=`expr $1 \* 6 - 1` + # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,60
+30,112 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +fixed_length=23 + for i in ${check_PROGRAMS} ; do + for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + safe_hint=" SAFE" else - export PNETCDF_HINTS= + SAFE_HINTS="romio_no_indep_rw=false" + safe_hint="NOSAFE" fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" + driver_hint=" MPIO" + else + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" + driver_hint="PNCIO" + fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + ina_hint=" INA" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + ina_hint="NOINA" + fi + + OUT_FILE=$INA_OUT_FILE + TEST_OPTS="$safe_hint $driver_hint $ina_hint" + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" - OPTS= + CMD_OPTS="-q -l 10" if test "$i" = "aggregation" ; then - OPTS="-b -c -i -j" + CMD_OPTS+=" -b -c -i -j" fi - # echo "${MPIRUN} ./$i -q ${OPTS} -l 10 ${TESTOUTDIR}/$i.nc" - ${MPIRUN} ./$i -q ${OPTS} -l 10 ${TESTOUTDIR}/$i.nc + + # echo "${LINENO}: ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc + if test $? = 0 ; then - echo "PASS: C parallel run on $1 processes --------------- $i" + printf "PASS: C nprocs=$1 %-${fixed_length}s -------- $i\n" "$TEST_OPTS" fi - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "" + # echo "${LINENO}:--- validating file ${OUT_FILE}.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "test burst buffering feature" + # echo "---- test burst buffering feature" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i -q -l 10 ${TESTOUTDIR}/$i.bb.nc + # echo "${LINENO}: ${MPIRUN} ./$i ${OUT_FILE}.bb.nc" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.bb.nc + if test $? 
= 0 ; then - echo "PASS: C parallel run on $1 processes --------------- $i" + printf "PASS: C nprocs=$1 %-${fixed_length}s -------- $i\n" "$TEST_OPTS BB" fi + export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + # echo "${LINENO}: --- validating file ${OUT_FILE}.bb.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc + DIFF_OPT="-q" + # echo "${LINENO}: --- ncmpidiff $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc ---" + ${MPIRUN} ${NCMPIDIFF} $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc fi - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i -q -l 10 ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc - rm -f ${OUTDIR}/$i.nc4 -done + if test "x${ENABLE_NETCDF4}" = x1 ; then + # echo "${LINENO}: test netCDF-4 feature" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc4 4 + # Validator does not support nc4 + fi + done # intra_aggr + done # mpiio_mode + + DIFF_OPT="-q" + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc + + done # safe_modes + rm -f ${OUTDIR}/$i*nc* +done # check_PROGRAMS diff --git a/benchmarks/C/write_block_read_column.c b/benchmarks/C/write_block_read_column.c index 4c129794c..5048783f4 100644 --- a/benchmarks/C/write_block_read_column.c +++ b/benchmarks/C/write_block_read_column.c @@ -42,6 +42,8 @@ #define NVARS 4 +static int verbose; + #define ERR(e) {if((e)!=NC_NOERR){printf("Error at line=%d: %s\n", __LINE__, ncmpi_strerror(e));nerrs++;}} /*----< print_info() >------------------------------------------------------*/ @@ -124,6 +126,8 @@ int benchmark_write(char *filename, psizes[0] = psizes[1] = 0; MPI_Dims_create(nprocs, 2, psizes); + if (verbose && rank == 0) printf("psizes = %d %d\n",psizes[0],psizes[1]); + gsizes[0] = len * psizes[0]; gsizes[1] = len * psizes[1]; @@ -157,11 +161,14 @@ int benchmark_write(char *filename, timing[2] = end_t - start_t; start_t = end_t; - start[0] = len * (rank % psizes[0]); - start[1] = len * ((rank / psizes[1]) % psizes[1]); + start[0] = len * (rank / psizes[1]); + start[1] = len * (rank % psizes[1]); count[0] = len; count[1] = len; + if (verbose) + printf("%d: start=%lld %lld count=%lld %lld\n",rank,start[0],start[1],count[0],count[1]); + for (i=0; i= ntimes */ vars[i].start[0] = j; if (vars[i].xtype == NC_FLOAT) @@ -913,17 +926,27 @@ int wrf_r_benchmark(char *in_file, printf(" %.2f GiB/s\n", bw/1024.0/max_t[0]); printf("-----------------------------------------------------------\n"); MPI_Info_get(info_used, "striping_factor", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint striping_factor: %s\n", value); + printf("MPI-IO hint striping_factor: %s\n", HINT); MPI_Info_get(info_used, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint striping_unit: %s\n", value); + printf("MPI-IO hint striping_unit: %s\n", HINT); MPI_Info_get(info_used, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag); - 
printf("MPI-IO hint cb_buffer_size: %s\n", value); - MPI_Info_get(info_used, "cb_node_list", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint cb_node_list: %s\n", value); + printf("MPI-IO hint cb_buffer_size: %s\n", HINT); MPI_Info_get(info_used, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint cb_nodes: %s\n", value); + printf("MPI-IO hint cb_nodes: %s\n", HINT); + MPI_Info_get(info_used, "cb_config_list", MPI_MAX_INFO_VAL, value, &flag); + printf("MPI-IO hint cb_config_list: %s\n", HINT); + MPI_Info_get(info_used, "cb_node_list", MPI_MAX_INFO_VAL, value, &flag); + printf("MPI-IO hint cb_node_list: %s\n", HINT); + MPI_Info_get(info_used, "nc_pncio", MPI_MAX_INFO_VAL, value, &flag); + printf("PnetCDF hint nc_pncio: %s\n", HINT); MPI_Info_get(info_used, "nc_num_aggrs_per_node",MPI_MAX_INFO_VAL, value, &flag); - printf("PnetCDF hint nc_num_aggrs_per_node: %s\n", value); + printf("PnetCDF hint nc_num_aggrs_per_node: %s\n", HINT); + MPI_Info_get(info_used, "nc_ina_node_list", MPI_MAX_INFO_VAL, value, &flag); + printf("PnetCDF hint nc_ina_node_list: %s\n", HINT); + MPI_Info_get(info_used, "cray_cb_nodes_multiplier", MPI_MAX_INFO_VAL, value, &flag); + printf("Hint cray_cb_nodes_multiplier: %s\n", HINT); + MPI_Info_get(info_used, "cray_cb_write_lock_mode", MPI_MAX_INFO_VAL, value, &flag); + printf("Hint cray_cb_write_lock_mode: %s\n", HINT); printf("-----------------------------------------------------------\n"); } MPI_Info_free(&info_used); diff --git a/configure.ac b/configure.ac index 142612104..8fec505ba 100644 --- a/configure.ac +++ b/configure.ac @@ -15,7 +15,7 @@ dnl AC_REVISION([$Revision$])dnl dnl autoconf v2.70 and later is required. See https://github.com/Parallel-NetCDF/PnetCDF/issues/94 dnl autoconf v2.70 was released in 2021-01-28 AC_PREREQ([2.70]) -AC_INIT([PnetCDF], [1.14.1], +AC_INIT([PnetCDF], [1.15.0-alpha], [parallel-netcdf@mcs.anl.gov], [pnetcdf], [https://parallel-netcdf.github.io]) @@ -69,8 +69,8 @@ AM_EXTRA_RECURSIVE_TARGETS([tests]) dnl parse the version numbers to 4 env variables PNETCDF_VERSION_MAJOR=`echo ${PACKAGE_VERSION} | cut -d. -f1` PNETCDF_VERSION_MINOR=`echo ${PACKAGE_VERSION} | cut -d. -f2` -PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3` -PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d. -f4` +PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3 | cut -d'-' -f1` +PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d'-' -f2` dnl Note major, minor, and sub are required, but pre is not. PNETCDF_VERSION=${PACKAGE_VERSION} @@ -1175,6 +1175,8 @@ dnl AC_CHECK_FUNCS([memset setlocale sqrt strchr strrchr strtol]) dnl AC_CHECK_LIB([m], [tanh]) dnl UD_CHECK_LIB_MATH +AC_CHECK_HEADERS([unistd.h fcntl.h malloc.h stddef.h sys/types.h limits.h time.h dirent.h]) + dnl When using gcc based compiler with -ansi flag, AC_CHECK_FUNCS can still dnl find strdup, but AC_CHECK_DECL cannot. So we check with AC_CHECK_DECL dnl first and then check AC_CHECK_FUNCS. 
@@ -1377,8 +1379,11 @@ AC_CHECK_FUNCS([MPI_Type_create_subarray_c \ MPI_Type_get_true_extent_c \ MPI_Type_get_envelope_c \ MPI_Type_get_contents_c \ + MPI_Status_set_elements_x \ MPI_Bcast_c \ MPI_Get_count_c \ + MPI_Isend_c \ + MPI_Irecv_c \ MPI_Pack_c \ MPI_Unpack_c \ MPI_File_read_at_c \ @@ -1459,6 +1464,14 @@ if test "$mpi_version" -ge "3" ; then [], [], [[#include <mpi.h>]]) fi +# check some MPI combiner types that are used internally in PnetCDF +UD_CHECK_MPI_CONSTANTS([MPI_COMBINER_DUP, + MPI_COMBINER_SUBARRAY, + MPI_COMBINER_DARRAY, + MPI_COMBINER_INDEXED_BLOCK, + MPI_COMBINER_HINDEXED_BLOCK], + [], [], [[#include <mpi.h>]]) + dnl Check presence of various MPI error classes. Introduced in MPI 2.0. dnl These could be enums, so we have to do compile checks. dnl AC_CHECK_DECLS([MPI_ERR_FILE_EXISTS, @@ -1521,6 +1534,94 @@ dnl UD_CHECK_MPI_DATATYPE(MPI_REAL8) dnl first defined in MPI 1.0 dnl UD_CHECK_MPI_DATATYPE(MPI_DOUBLE_PRECISION) dnl first defined in MPI 1.0 dnl fi +AC_MSG_CHECKING([whether MPI_Waitall takes MPI_STATUSES_IGNORE]) +if test "x${GCC}" = xyes; then + saved_CFLAGS=${CFLAGS} + CFLAGS="-Werror -Wstringop-overflow=2" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <mpi.h>]],[[ + int count; + MPI_Request *reqs; + MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE); + ]])], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no]) + CFLAGS=${saved_CFLAGS} +else + AC_CHECK_DECL([MPI_STATUSES_IGNORE], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no], [[#include <mpi.h>]]) +fi +AC_MSG_RESULT([$MPI_STATUSES_IGNORE]) +if test "x$MPI_STATUSES_IGNORE" = xyes ; then + AC_DEFINE(HAVE_MPI_STATUSES_IGNORE, 1, [Whether MPI_Waitall takes argument MPI_STATUSES_IGNORE]) +fi + +# +# Check for statfs (many) and specifically f_fstypename field (BSD) +# +AC_CHECK_HEADERS(sys/vfs.h sys/param.h sys/mount.h sys/statvfs.h sys/stat.h sys/type.h unistd.h) + +AC_CHECK_FUNCS([statvfs statfs stat]) + +AC_CHECK_MEMBERS([struct statvfs.f_basetype, + struct statfs.f_fstypename, + struct statfs.f_type, + struct stat.st_fstype],[],[], + AC_INCLUDES_DEFAULT + [#ifdef HAVE_SYS_VFS_H + #include <sys/vfs.h> + #endif + #ifdef HAVE_SYS_PARAM_H + #include <sys/param.h> + #endif + #ifdef HAVE_SYS_MOUNT_H + #include <sys/mount.h> + #endif + #ifdef HAVE_SYS_STATFS_H + #include <sys/statfs.h> + #endif + #ifdef HAVE_SYS_STAT_H + #include <sys/stat.h> + #endif + #ifdef HAVE_SYS_TYPE_H + #include <sys/type.h> + #endif + #ifdef HAVE_UNISTD_H + #include <unistd.h> + #endif + ]) + +AC_CHECK_TYPE([blksize_t],[],[AC_DEFINE_UNQUOTED([blksize_t],[__blksize_t],[Provide blksize_t if not available]) ], [[ + #ifdef HAVE_SYS_TYPES_H + #include <sys/types.h> + #endif + #ifdef HAVE_SYS_STAT_H + #include <sys/stat.h> + #endif + #ifdef HAVE_UNISTD_H + #include <unistd.h> + #endif]] ) + +AC_CHECK_DECLS([pwrite]) + +# +# Check if Lustre is available by verifying presence of lustre/lustre_user.h +# +has_lustre=no +AC_CHECK_HEADERS([lustre/lustre_user.h linux/lustre/lustre_user.h], + [has_lustre=yes ; break]) +if test "x$has_lustre" = xyes ; then + AC_DEFINE(HAVE_LUSTRE, 1, [Define for LUSTRE]) + LIBS="$LIBS -llustreapi" +fi +AM_CONDITIONAL(HAVE_LUSTRE, [test x$has_lustre = xyes]) + +if test "x$has_lustre" = xno ; then + AC_MSG_CHECKING([whether mimicking Lustre]) + mimicking_lustre=no + if test "x$MIMIC_LUSTRE" = xyes ; then + AC_DEFINE(MIMIC_LUSTRE, 1, [Define for mimicking LUSTRE file system]) + mimicking_lustre=yes + fi + AC_MSG_RESULT($mimicking_lustre) +fi + AC_C_CHAR_UNSIGNED AC_C_BIGENDIAN AM_CONDITIONAL(IS_BIGENDIAN, [test x$ac_cv_c_bigendian = xyes]) @@ -1739,8 +1840,15 @@ if test "x${debug}" = xyes; then if test "x$?"
!= x0 ; then CFLAGS="$CFLAGS -g" fi - CFLAGS=`echo $CFLAGS | ${SED} 's/-O. *//g' | ${SED} 's/-fast *//g'` - CFLAGS="$CFLAGS -O0" + + # remove -fast if set by user + CFLAGS=`echo $CFLAGS | ${SED} 's/-fast *//g'` + + # check if -O is set by user, if not, then add -O0 + str_found=`echo "${CFLAGS}" | ${EGREP} -- "-O"` + if test "x$str_found" = x ; then + CFLAGS="$CFLAGS -O0" + fi if test "x${has_mpicxx}" = xyes ; then str_found=`echo "${CXXFLAGS}" | ${EGREP} -- "-g"` @@ -2603,10 +2711,10 @@ else # no name prefix end with ':' FSTYPE_PREFIX= else - # check if name prefix is one of file system types known to ROMIO - romio_known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) + # check if name prefix is one of file system types known to PNCIO + known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) known_fstype= - for pre in $romio_known_fstypes ; do + for pre in $known_fstypes ; do if test "$FSTYPE_PREFIX" = $pre ; then known_fstype=$pre break @@ -2691,7 +2799,7 @@ dnl Update the version information only immediately before a public release. dnl PnetCDF starts with 1:0:0 (shared library is first supported in 1.9.0) dnl because some package distributors, such as Debian, may have already built dnl PnetCDF with shared libraries. -ABIVERSION="7:0:0" +ABIVERSION="8:0:1" AC_SUBST(ABIVERSION) if test "$enable_versioning" = "yes" ; then ABIVERSIONFLAGS="-version-info \$(ABIVERSION)" @@ -2711,6 +2819,7 @@ AC_CONFIG_FILES(Makefile \ src/drivers/common/Makefile \ src/drivers/include/Makefile \ src/drivers/ncmpio/Makefile \ + src/drivers/pncio/Makefile \ src/drivers/nc4io/Makefile \ src/drivers/ncadios/Makefile \ src/drivers/ncbbio/Makefile \ diff --git a/examples/C/Makefile.am b/examples/C/Makefile.am index 009095a77..c1b881ebc 100644 --- a/examples/C/Makefile.am +++ b/examples/C/Makefile.am @@ -73,28 +73,28 @@ NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ $(NC_FILES) $(TESTOUTDIR)/pthread.nc.* $(TESTOUTDIR)/testfile.nc -EXTRA_DIST = parallel_run.sh run_c_examples.sh cdl_header.txt +EXTRA_DIST = run_c_examples.sh cdl_header.txt ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 "C " || exit 1 ptest8: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 8 "C " || exit 1 ptest3: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 3 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 3 || exit 1 + $(srcdir)/../parallel_run.sh 3 "C " || exit 1 ptests: ptest3 ptest4 ptest8 ptest2 ptest6 ptest10: diff --git a/examples/C/create_from_cdl.c b/examples/C/create_from_cdl.c index a2a2ae179..2a238153d 100644 --- a/examples/C/create_from_cdl.c +++ b/examples/C/create_from_cdl.c @@ -168,13 +168,16 @@ int main(int argc, char **argv) err = ncmpi_def_var(ncid, name, xtype, ndims, dimids, &varid); CHECK_ERR + /* fill with default fill value */ + err = 
ncmpi_def_var_fill(ncid, varid, 0, NULL); + CHECK_ERR + /* retrieve metadata of attribute j associated with variable i */ err = cdl_hdr_inq_nattrs(hid, i, &nattrs); CHECK_ERR for (j=0; j 0) { /* non-scalar attribute */ /* note xsz is aligned, thus must use the exact size of buf */ - int rank, itype_size; + int itype_size; size_t buf_size; void *root_buf; - MPI_Comm_rank(comm, &rank); - /* for attributes, itype is nc_type, so its size is small. Thus, no * need to check against NC_MAX_INT. */ diff --git a/src/dispatchers/cdl_header_parser.c b/src/dispatchers/cdl_header_parser.c index 421999e85..d5efc2e78 100644 --- a/src/dispatchers/cdl_header_parser.c +++ b/src/dispatchers/cdl_header_parser.c @@ -720,7 +720,7 @@ int cdl_hdr_open(const char *filename, return NC_EFILE; } rlen = fread(fbuf, 1, file_size, fptr); - if (rlen < 0) { + if (file_size > 0 && rlen == 0) { printf("Error in %s at %d: fail to fread file %s (%s)\n", __func__,__LINE__,filename,strerror(errno)); return NC_EFILE; diff --git a/src/dispatchers/error_codes.c b/src/dispatchers/error_codes.c index 1a71538f0..e74a20d48 100644 --- a/src/dispatchers/error_codes.c +++ b/src/dispatchers/error_codes.c @@ -295,6 +295,8 @@ ncmpi_strerror(int err) return "Nonblocking requests already flushed."; case NC_EADIOS: return "unknown ADIOS error."; + case NC_EFSTYPE: + return "Invalid file system type."; default: /* check netCDF-3 and netCDF-4 errors */ @@ -719,6 +721,7 @@ ncmpi_strerrno(int err) case (NC_EBADLOG): return "NC_EBADLOG"; case (NC_EFLUSHED): return "NC_EFLUSHED"; case (NC_EADIOS): return "NC_EADIOS"; + case (NC_EFSTYPE): return "NC_EFSTYPE"; case (NC_EMULTIDEFINE): return "NC_EMULTIDEFINE"; case (NC_EMULTIDEFINE_OMODE): return "NC_EMULTIDEFINE_OMODE"; diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index b68782038..b7805d806 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -495,31 +495,26 @@ ncmpi_create(MPI_Comm comm, else pncp->comm = comm; + /* fill in pncp members */ + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's create subroutine */ - err = driver->create(pncp->comm, path, cmode, *ncidp, combined_info, &ncp); + err = driver->create(pncp->comm, pncp->path, cmode, *ncidp, combined_info, + &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_CMODE) { del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } - /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = cmode; pncp->driver = driver; pncp->ndims = 0; @@ -759,8 +754,13 @@ ncmpi_open(MPI_Comm comm, else pncp->comm = comm; + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's open subroutine */ - err = driver->open(pncp->comm, path, omode, *ncidp, combined_info, &ncp); + err = driver->open(pncp->comm, pncp->path, omode, *ncidp, combined_info, + &ncp); if (status == NC_NOERR) 
status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_OMODE && @@ -770,23 +770,13 @@ ncmpi_open(MPI_Comm comm, del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = omode; pncp->driver = driver; pncp->ndims = 0; @@ -1251,9 +1241,8 @@ ncmpi_inq_file_format(const char *filename, __func__,__LINE__,filename); DEBUG_RETURN_ERROR(NC_EFILE) } - if (close(fd) == -1) { + if (close(fd) == -1) DEBUG_RETURN_ERROR(NC_EFILE) - } if (memcmp(signature, cdf_signature, 3) == 0) { if (signature[3] == 5) *formatp = NC_FORMAT_CDF5; diff --git a/src/drivers/Makefile.am b/src/drivers/Makefile.am index 3749fcd99..de1a6a092 100644 --- a/src/drivers/Makefile.am +++ b/src/drivers/Makefile.am @@ -6,7 +6,7 @@ # # @configure_input@ -SUBDIRS = include common ncmpio +SUBDIRS = include common ncmpio pncio if BUILD_DRIVER_FOO SUBDIRS += ncfoo @@ -24,7 +24,7 @@ if ENABLE_ADIOS SUBDIRS += ncadios endif -DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios +DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios pncio # For VPATH build (parallel build), try delete all sub-directories distclean-local: diff --git a/src/drivers/common/mem_alloc.c b/src/drivers/common/mem_alloc.c index 279dd44b4..7def10c6c 100644 --- a/src/drivers/common/mem_alloc.c +++ b/src/drivers/common/mem_alloc.c @@ -9,13 +9,15 @@ NCI_Malloc(size) NCI_Calloc(nelems, esize) NCI_Realloc(ptr, size) + NCI_Strdup(ptr) NCI_Free(ptr) In macro.h, they are macro-replaced to - NCI_Malloc_fn(size, __LINE__, __FILE__) and - NCI_Calloc_fn(nelems, esize, __LINE__, __FILE__) and + NCI_Malloc_fn(size, __LINE__, __func__, __FILE__) + NCI_Calloc_fn(nelems, esize, __LINE__, __func__, __FILE__) NCI_Realloc_fn(ptr, size, __LINE__, __func__, __FILE__) - NCI_Free_fn(ptr,__LINE__,__FILE__). + NCI_Strdup_fn(ptr, __LINE__, __func__, __FILE__) + NCI_Free_fn(ptr, __LINE__, __func__, __FILE__). */ #ifdef HAVE_CONFIG_H diff --git a/src/drivers/common/utils.c b/src/drivers/common/utils.c index e60b6e30a..ffac366ac 100644 --- a/src/drivers/common/utils.c +++ b/src/drivers/common/utils.c @@ -61,7 +61,7 @@ ncmpii_xlen_nc_type(nc_type xtype, int *size) } } -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", "testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if @@ -91,3 +91,152 @@ char* ncmpii_remove_file_system_type_prefix(const char *filename) return ret_filename; } +/*----< ncmpii_construct_node_list() >---------------------------------------*/ +/* This subroutine is a collective call. 
It finds the affinity of each MPI + * process to the compute node and returns the following: + * num_nodes_ptr Number of unique nodes (host names) + * node_ids_ptr [nprocs] node IDs of each rank, must be freed by caller. + */ +int +ncmpii_construct_node_list(MPI_Comm comm, + int *num_nodes_ptr, /* OUT: */ + int **node_ids_ptr) /* OUT: [nprocs] */ +{ + char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; + int i, j, k, rank, nprocs, num_nodes, my_procname_len, root=0; + int *node_ids=NULL, *all_procname_lens=NULL; + + MPI_Comm_size(comm, &nprocs); + MPI_Comm_rank(comm, &rank); + + /* Collect host names of allocated compute nodes. Note my_procname is null + * character terminated, but my_procname_len does not include the null + * character. + */ + MPI_Get_processor_name(my_procname, &my_procname_len); +#if 0 +#ifdef MIMIC_LUSTRE +#define MIMIC_NUM_NODES 1 + /* mimic number of compute nodes = MIMIC_NUM_NODES */ + int node_id, np_per_node = nprocs / MIMIC_NUM_NODES; + if (nprocs % MIMIC_NUM_NODES > 0) np_per_node++; + if (rank < np_per_node * (nprocs % MIMIC_NUM_NODES)) + node_id = rank / np_per_node; + else + node_id = (rank - np_per_node * (nprocs % MIMIC_NUM_NODES)) / (nprocs / MIMIC_NUM_NODES) + (nprocs % MIMIC_NUM_NODES); + + sprintf(my_procname,"compute.node.%d", node_id); + my_procname_len = (int)strlen(my_procname); +#endif +#endif + + my_procname_len++; /* to include the terminating null character */ + + if (rank == root) { + /* root collects all procnames */ + all_procnames = (char **) NCI_Malloc(sizeof(char*) * nprocs); + if (all_procnames == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + + all_procname_lens = (int *) NCI_Malloc(sizeof(int) * nprocs); + if (all_procname_lens == NULL) { + NCI_Free(all_procnames); + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + } + /* gather process name lengths from all processes first */ + MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, + root, comm); + + if (rank == root) { + int *disp; + size_t alloc_size = 0; + + for (i=0; icomm); + err = ncbbp->ncmpio_driver->begin_indep_data(ncbbp->ncp); if (err != NC_NOERR) return err; diff --git a/src/drivers/ncmpio/Makefile.am b/src/drivers/ncmpio/Makefile.am index c1afe76c1..9cbd78e13 100644 --- a/src/drivers/ncmpio/Makefile.am +++ b/src/drivers/ncmpio/Makefile.am @@ -12,6 +12,7 @@ AM_CPPFLAGS = -I${top_srcdir}/src/include AM_CPPFLAGS += -I${top_builddir}/src/include AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include AM_CPPFLAGS += -I${top_builddir}/src/drivers/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/pncio if PNETCDF_DEBUG AM_CPPFLAGS += -DPNETCDF_DEBUG diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index 0e2c71e81..f64567ab0 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -16,6 +16,7 @@ #include <mpi.h> #include "ncmpio_driver.h" +#include "pncio.h" #define NC_DEFAULT_H_MINFREE 0 #define NC_DEFAULT_V_ALIGN 512 @@ -156,7 +157,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_dimarray { - int ndefined; /* number of defined dimensions */ + int ndefined; /* no. defined dimensions */ int unlimited_id; /* -1 for not defined, otherwise >= 0 */ NC_dim **value; int hash_size; @@ -180,7 +181,7 @@ ncmpio_dup_NC_dimarray(NC_dimarray *ncap, const NC_dimarray *ref); * NC attribute */ typedef struct { - MPI_Offset nelems; /* number of attribute elements */ + MPI_Offset nelems; /* no.
attribute elements */ MPI_Offset xsz; /* amount of space at xvalue (4-byte aligned) */ nc_type xtype; /* external NC data type of the attribute */ size_t name_len; /* strlen(name) for faster string compare */ @@ -199,7 +200,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_attrarray { - int ndefined; /* number of defined attributes */ + int ndefined; /* no. defined attributes */ NC_attr **value; int hash_size; NC_nametable *nameT; @@ -238,7 +239,7 @@ typedef struct { int no_fill; /* whether fill mode is disabled */ size_t name_len;/* strlen(name) for faster string compare */ char *name; /* name of the variable */ - int ndims; /* number of dimensions */ + int ndims; /* no. dimensions */ int *dimids; /* [ndims] array of dimension IDs */ MPI_Offset *shape; /* [ndims] dim->size of each dim shape[0] == NC_UNLIMITED if record variable */ @@ -268,8 +269,8 @@ typedef struct { */ /* note: we only allow less than 2^31-1 variables defined in a file */ typedef struct NC_vararray { - int ndefined; /* number of defined variables */ - int num_rec_vars;/* number of defined record variables */ + int ndefined; /* no. defined variables */ + int num_rec_vars;/* no. defined record variables */ NC_var **value; int hash_size; NC_nametable *nameT; @@ -319,15 +320,15 @@ typedef struct NC_lead_req { int flag; /* bit-wise OR of the above NC_REQ_* flags */ int id; /* even number for write, odd for read */ int nonlead_off; /* start index in the non-lead queue */ - int nonlead_num; /* number of non-lead requests */ + int nonlead_num; /* no. non-lead requests */ int abuf_index; /* index in the abuf occupy_table. -1 means not using attached buffer */ void *buf; /* user buffer */ void *xbuf; /* buffer in external type, may be == buf */ NC_var *varp; /* pointer to NC variable object */ - MPI_Offset nelems; /* total number of array elements requested */ + MPI_Offset nelems; /* total no. array elements requested */ MPI_Offset max_rec; /* highest record requested */ - MPI_Offset bufcount; /* number of buftype in this request */ + MPI_Offset bufcount; /* no. buftype in this request */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ MPI_Datatype buftype; /* user defined derived data type */ MPI_Datatype itype; /* internal element data type in buftype */ @@ -338,10 +339,11 @@ typedef struct NC_lead_req { typedef struct NC_req { MPI_Offset offset_start; /* starting offset of aggregate access region */ MPI_Offset offset_end; /* ending offset of aggregate access region */ - MPI_Offset nelems; /* number of array elements requested */ + MPI_Offset nelems; /* no. array elements requested */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ void *xbuf; /* buffer in external type, used in file I/O calls */ int lead_off; /* start index in the lead queue */ + MPI_Aint npairs; /* no. flattened offset-length pairs */ } NC_req; #define NC_ABUF_DEFAULT_TABLE_SIZE 128 @@ -382,11 +384,10 @@ struct NC { int safe_mode; /* 0 or 1, for parameter consistency check */ #ifdef ENABLE_SUBFILING int subfile_mode; /* 0 or 1, for disable/enable subfiling */ - int num_subfiles; /* number of subfiles */ + int num_subfiles; /* no. 
subfiles */ struct NC *ncp_sf; /* ncp of subfile */ MPI_Comm comm_sf; /* subfile MPI communicator */ #endif - int striping_unit; /* stripe size of the file */ int chunk; /* chunk size for reading header, one chunk at a time */ MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ MPI_Offset r_align; /* file alignment for record variable section */ @@ -407,16 +408,20 @@ struct NC { MPI_Offset recsize; /* length of 'record': sum of single record sizes of all the record variables */ - MPI_Offset numrecs; /* number of 'records' allocated */ + MPI_Offset numrecs; /* no. 'records' allocated */ MPI_Offset put_size; /* amount of writes committed so far in bytes */ MPI_Offset get_size; /* amount of reads committed so far in bytes */ MPI_Comm comm; /* MPI communicator */ int rank; /* MPI rank of this process */ - int nprocs; /* number of MPI processes */ + int nprocs; /* no. MPI processes */ + int num_nodes; /* no. unique compute nodes allocated */ + int *node_ids; /* [nprocs] node IDs of each rank */ MPI_Info mpiinfo; /* used MPI info object */ - MPI_File collective_fh; /* file handle for collective mode */ - MPI_File independent_fh; /* file handle for independent mode */ + MPI_File collective_fh; /* MPI-IO file handle for collective mode */ + MPI_File independent_fh; /* MPI-IO file handle for independent mode */ + PNCIO_File *pncio_fh; /* PNCIO file handle */ + int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ NC_dimarray dims; /* dimensions defined */ NC_attrarray attrs; /* global attributes defined */ @@ -426,36 +431,55 @@ struct NC { int maxGetReqID; /* max get request ID */ int maxPutReqID; /* max put request ID */ - int numLeadGetReqs; /* number of pending lead get requests */ - int numLeadPutReqs; /* number of pending lead put requests */ + int numLeadGetReqs; /* no. pending lead get requests */ + int numLeadPutReqs; /* no. pending lead put requests */ NC_lead_req *get_lead_list; /* list of lead nonblocking read requests */ NC_lead_req *put_lead_list; /* list of lead nonblocking write requests */ - int numGetReqs; /* number of pending nonblocking get requests */ - int numPutReqs; /* number of pending nonblocking put requests */ + int numGetReqs; /* no. pending nonblocking get requests */ + int numPutReqs; /* no. pending nonblocking put requests */ NC_req *get_list; /* list of nonblocking read requests */ NC_req *put_list; /* list of nonblocking write requests */ NC_buf *abuf; /* attached buffer, used by bput APIs */ - char *path; /* file name */ + const char *path; /* file name */ struct NC *old; /* contains the previous NC during redef. */ - /* Below are used for intra-node aggregation */ - int num_aggrs_per_node; /* number of aggregators per compute node. Set - through a user hint. 0 to disable the - intra-node aggregation, -1 to let PnetCDF to - decide. This value must be the same among all - processes. + /* Below are used for intra-node aggregation (INA) */ + MPI_Comm ina_comm; /* communicator of only intra-node aggregators */ + int ina_nprocs;/* no. processes in intra-node communicator */ + int ina_rank; /* rank ID in intra-node communicator */ + int num_aggrs_per_node; /* no. aggregators per compute node. Set through a + * user hint. 0 to disable the intra-node + * aggregation, -1 to let PnetCDF decide. This + * value must be the same among all processes. */ int my_aggr; /* rank ID of my aggregator */ - int num_nonaggrs; /* number of non-aggregators assigned */ + int num_nonaggrs; /* no.
non-aggregators assigned */ int *nonaggr_ranks; /* ranks of assigned non-aggregators */ + int *ina_node_list; /* rank IDs of INA aggregators */ + #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double aggr_time; + double ina_time_init; + double ina_time_flatten; + double ina_time_put[5]; + double ina_time_get[5]; + size_t ina_npairs_put; + size_t ina_npairs_get; + size_t maxmem_put[6]; + size_t maxmem_get[6]; #endif }; +typedef struct bufferinfo { + NC *ncp; + MPI_Offset offset; /* current read/write offset in the file */ + char *base; /* beginning of read/write buffer */ + char *pos; /* current position in buffer */ + char *end; /* end position of buffer */ +} bufferinfo; + #define NC_readonly(ncp) fIsSet((ncp)->flags, NC_MODE_RDONLY) #define NC_IsNew(ncp) fIsSet((ncp)->flags, NC_MODE_CREATE) #define NC_indef(ncp) fIsSet((ncp)->flags, NC_MODE_DEF) @@ -474,9 +498,6 @@ struct NC { (NC_EMULTIDEFINE_FIRST >= (err) && (err) >= NC_EMULTIDEFINE_LAST) /* Begin defined in nc.c ----------------------------------------------------*/ -extern void -ncmpio_free_NC(NC *ncp); - extern int ncmpio_NC_check_vlen(NC_var *varp, MPI_Offset vlen_max); @@ -487,20 +508,6 @@ extern int ncmpio_NC_check_voffs(NC *ncp); /* Begin defined in ncmpio_header_get.c -------------------------------------*/ -typedef struct bufferinfo { - MPI_Comm comm; - MPI_File collective_fh; - MPI_Offset get_size; /* amount of file read n bytes so far */ - MPI_Offset offset; /* current read/write offset in the file */ - int chunk; /* chunk size for reading the header */ - int version; /* 1, 2, and 5 for CDF-1, 2, and 5 respectively */ - int safe_mode;/* 0: disabled, 1: enabled */ - int coll_mode;/* 0: independent, 1: collective */ - char *base; /* beginning of read/write buffer */ - char *pos; /* current position in buffer */ - char *end; /* end position of buffer */ -} bufferinfo; - extern MPI_Offset ncmpio_hdr_len_NC(const NC *ncp); @@ -515,9 +522,6 @@ extern int ncmpio_write_header(NC *ncp); /* Begin defined in ncmpio_sync.c -------------------------------------------*/ -extern int -ncmpio_file_sync(NC *ncp); - extern int ncmpio_write_numrecs(NC *ncp, MPI_Offset new_numrecs); @@ -528,10 +532,6 @@ ncmpio_filetype_create_vars(const NC* ncp, const NC_var* varp, const MPI_Offset stride[], MPI_Offset *offset, MPI_Datatype *filetype, int *is_filetype_contig); -extern int -ncmpio_file_set_view(const NC *ncp, MPI_File fh, MPI_Offset *offset, - MPI_Datatype filetype); - /* Begin defined in ncmpio_igetput.m4 ---------------------------------------*/ extern int ncmpio_abuf_malloc(NC *ncp, MPI_Offset nbytes, void **buf, int *abuf_index); @@ -607,17 +607,16 @@ ncmpio_inq_var_fill(NC_var *varp, void *fill_value); extern int ncmpio_fill_vars(NC *ncp); -/* Begin defined in ncmpio_nonblocking.c ------------------------------------*/ -extern int -ncmpio_getput_zero_req(NC *ncp, int rw_flag); - -/* Begin defined in ncmpio_close.c */ -extern int -ncmpio_close_files(NC *ncp, int doUnlink); +/* Begin defined in ncmpio_close.c ------------------------------------------*/ +extern void +ncmpio_free_NC(NC *ncp); /* Begin defined in ncmpio_utils.c ------------------------------------------*/ extern void -ncmpio_set_pnetcdf_hints(NC *ncp, MPI_Info user_info, MPI_Info info_used); +ncmpio_hint_extract(NC *ncp, MPI_Info info); + +extern void +ncmpio_hint_set(NC *ncp, MPI_Info info); extern int ncmpio_NC_check_name(const char *name, int file_ver); @@ -644,23 +643,73 @@ ncmpio_unpack_xbuf(int format, NC_var *varp, MPI_Offset bufcount, MPI_Datatype etype, 
MPI_Datatype imaptype, int need_convert, int need_swap, void *buf, void *xbuf); +extern int +ncmpio_calc_off(const NC *ncp, const NC_var *varp, const MPI_Offset *start, + MPI_Offset *offset); + +extern int +ncmpio_calc_start_end(const NC *ncp, const NC_var *varp, + const MPI_Offset *start, const MPI_Offset *count, + const MPI_Offset *stride, MPI_Offset *start_off, + MPI_Offset *end_off); + /* Begin defined in ncmpio_file_io.c ----------------------------------------*/ +extern MPI_Offset +ncmpio_file_read_at(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_read_at_all(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at_all(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern int +ncmpio_getput_zero_req(NC *ncp, int rw_flag); + +extern int +ncmpio_read_write(NC *ncp, int rw_flag, MPI_Offset offset, + PNCIO_View flat_btype, void *buf); + +extern int +ncmpio_file_close(NC *ncp); + +extern int +ncmpio_file_delete(NC *ncp); + +extern int +ncmpio_file_sync(NC *ncp); + +extern int +ncmpio_file_set_view(const NC *ncp, MPI_Offset disp, MPI_Datatype filetype, + MPI_Aint npairs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, MPI_Count *lengths +#else + MPI_Offset *offsets, int *lengths +#endif +); + extern int -ncmpio_read_write(NC *ncp, int rw_flag, int coll_indep, MPI_Offset offset, - MPI_Offset buf_count, MPI_Datatype buf_type, void *buf, - int buftype_is_contig); +ncmpio_file_open(NC *ncp, MPI_Comm comm, const char *path, int omode, + MPI_Info info); /* Begin defined in ncmpio_intranode.c --------------------------------------*/ extern int -ncmpio_intra_node_aggr_init(NC *ncp); +ncmpio_ina_init(NC *ncp); extern int -ncmpio_intra_node_aggregation_nreqs(NC *ncp, int mode, int num_reqs, - NC_req *put_list, MPI_Offset newnumrecs); +ncmpio_ina_nreqs(NC *ncp, int mode, int num_reqs, NC_req *put_list, + MPI_Offset newnumrecs); extern int -ncmpio_intra_node_aggregation(NC *ncp, int mode, NC_var *varp, - const MPI_Offset *start, const MPI_Offset *count, - const MPI_Offset *stride, MPI_Offset bufCount, - MPI_Datatype bufType, void *buf); +ncmpio_ina_req(NC *ncp, int mode, NC_var *varp, const MPI_Offset *start, + const MPI_Offset *count, const MPI_Offset *stride, + MPI_Offset nbytes, void *buf); #endif /* H_NC */ diff --git a/src/drivers/ncmpio/ncmpio_close.c b/src/drivers/ncmpio/ncmpio_close.c index ec79088e8..cf8553c63 100644 --- a/src/drivers/ncmpio/ncmpio_close.c +++ b/src/drivers/ncmpio/ncmpio_close.c @@ -51,46 +51,11 @@ ncmpio_free_NC(NC *ncp) if (ncp->get_list != NULL) NCI_Free(ncp->get_list); if (ncp->put_list != NULL) NCI_Free(ncp->put_list); if (ncp->abuf != NULL) NCI_Free(ncp->abuf); - if (ncp->path != NULL) NCI_Free(ncp->path); if (ncp->nonaggr_ranks != NULL) NCI_Free(ncp->nonaggr_ranks); NCI_Free(ncp); } -/*----< ncmpio_close_files() >-----------------------------------------------*/ -int -ncmpio_close_files(NC *ncp, int doUnlink) { - char *mpi_name; - int mpireturn; - - assert(ncp != NULL); /* this should never occur */ - - if (ncp->independent_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->independent_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (ncp->nprocs > 1 && ncp->collective_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->collective_fh)); - if (mpireturn != MPI_SUCCESS) - 
return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (doUnlink) { - /* called from ncmpi_abort, if the file is being created and is still - * in define mode, the file is deleted */ - if (ncp->rank == 0) { - TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - if (ncp->nprocs > 1) - MPI_Barrier(ncp->comm); - } - return NC_NOERR; -} - /*----< ncmpio_close() >------------------------------------------------------*/ /* This function is collective */ int @@ -159,8 +124,69 @@ ncmpio_close(void *ncdp) } #endif - /* calling MPI_File_close() */ - err = ncmpio_close_files(ncp, 0); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, j, ntimers; + double tt[16], max_t[16], put_time=0, get_time=0; + MPI_Offset sizes[16], max_sizes[16], max_npairs_put=0, max_npairs_get=0; + + /* print intra-node aggregation timing breakdown */ + if (ncp->num_aggrs_per_node > 0) { + j = 0; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_put[i]; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_get[i]; + sizes[12] = ncp->ina_npairs_put; + sizes[13] = ncp->ina_npairs_get; + + MPI_Allreduce(sizes, max_sizes, 14, MPI_OFFSET, MPI_MAX, ncp->comm); + max_npairs_put = max_sizes[12]; + max_npairs_get = max_sizes[13]; + + for (i=0; i<12; i++) tt[i] = (float)(max_sizes[i]) / 1048576.0; /* in MiB */ + if (ncp->rank == 0 && max_npairs_put > 0) + printf("%s: INA put npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[12], tt[0],tt[1],tt[2],tt[3],tt[4],tt[5]); + if (ncp->rank == 0 && max_npairs_get > 0) + printf("%s: INA get npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[13], tt[6],tt[7],tt[8],tt[9],tt[10],tt[11]); + + if (max_npairs_put > 0) { /* put npairs > 0 */ + put_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 4; + for (i=0; i<ntimers; i++) { + tt[i] = ncp->ina_time_put[i]; + put_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = put_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + put_time = max_t[ntimers+2]; + if (ncp->rank == 0) + printf("%s: INA put timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],put_time); + } + if (max_npairs_get > 0) { /* get npairs > 0 */ + get_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 4; + for (i=0; i<ntimers; i++) { + tt[i] = ncp->ina_time_get[i]; + get_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = get_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + if (ncp->rank == 0) + printf("%s: INA get timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],max_t[ntimers+2]); + } + } +#endif + + /* close the file */ + err = ncmpio_file_close(ncp); if (status == NC_NOERR) status = err; /* file is open for write and no variable has been defined */ @@ -219,6 +245,10 @@ ncmpio_close(void *ncdp) if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); } + /* free the intra-node aggregation communicator */ + if (ncp->ina_comm != MPI_COMM_NULL) + MPI_Comm_free(&ncp->ina_comm); + /* free up space occupied by the header metadata */ ncmpio_free_NC(ncp); diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index e5cee83d3..b1444d0b8 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++
b/src/drivers/ncmpio/ncmpio_create.c @@ -8,7 +8,6 @@ * This file implements the corresponding APIs defined in src/dispatchers/file.c * * ncmpi_create() : dispatcher->create() - * ncmpi_open() : dispatcher->open() */ #ifdef HAVE_CONFIG_H @@ -42,18 +41,21 @@ ncmpio_create(MPI_Comm comm, MPI_Info user_info, /* user's and env info combined */ void **ncpp) { - char *env_str, *filename, *mpi_name; + char *env_str, *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int rank, nprocs, mpiomode, err, mpireturn, default_format, file_exist=1; - int use_trunc=1; - MPI_File fh; - MPI_Info info_used; + int use_trunc=1, flag; + MPI_File fh=MPI_FILE_NULL; NC *ncp=NULL; *ncpp = NULL; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + /* Note path's validity and cmode consistency have been checked in - * ncmpi_create() in src/dispatchers/file.c and - * path consistency will be done in MPI_File_open */ + * ncmpi_create() in src/dispatchers/file.c and path consistency will be + * done in MPI_File_open. + */ /* First, check whether cmode is valid or supported ---------------------*/ @@ -66,25 +68,61 @@ ncmpio_create(MPI_Comm comm, /* Check cmode for other illegal flags already done in dispatcher layer */ /* Get default format, in case cmode does not include either - * NC_64BIT_OFFSET or NC_64BIT_DATA */ + * NC_64BIT_OFFSET or NC_64BIT_DATA. + */ ncmpi_inq_default_format(&default_format); - /* Handle file clobber --------------------------------------------------*/ - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &nprocs); + /* allocate buffer for header object NC and initialize its contents */ + ncp = (NC*) NCI_Calloc(1, sizeof(NC)); + if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + + *ncpp = (void*)ncp; + + ncp->ncid = ncid; + ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ + ncp->rank = rank; + ncp->nprocs = nprocs; + + /* Extract hints from user_info. Two hints must be extracted now in order + * to continue: + * nc_pncio: whether to use MPI-IO or PnetCDF's PNCIO driver. + * nc_num_aggrs_per_node: number of processes per node to be the INA + * aggregators. + * + * ncp->fstype will be set in ncmpio_hint_extract(). + */ + ncmpio_hint_extract(ncp, user_info); + if (ncp->fstype == PNCIO_FSTYPE_CHECK) + /* Check file system type. If the given file does not exist, check its + * folder. Currently PnetCDF's PNCIO drivers support Lustre + * (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS). + */ + ncp->fstype = PNCIO_FileSysType(path); + +#ifdef WKL_DEBUG +if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == PNCIO_FSTYPE_MPIIO)? "PNCIO_FSTYPE_MPIIO" : (ncp->fstype == PNCIO_LUSTRE) ? "PNCIO_LUSTRE" : "PNCIO_UFS"); +#endif + + /* Set the file open mode in mpiomode, which may later be needed in + * ncmpi_begin_indep_data() to open the file for independent data mode. + */ mpiomode = MPI_MODE_RDWR | MPI_MODE_CREATE; - /* remove the file system type prefix name if there is any. For example, + /* Remove the file system type prefix name if there is any. For example, * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make * filename pointing to "/home/foo/testfile.nc", so it can be used in POSIX - * access() below + * access() below. */ filename = ncmpii_remove_file_system_type_prefix(path); - /* Check if the file already exists, if lstat() or access() is available */ + /* In case of clobber mode, first check if the file already exists, through + * a call to lstat() or access(), if either is available.
If neither is available, we
+     * assume the file exists and add an MPI flag to the open mode argument
+     * of MPI_File_open to delete or truncate the file first.
+     */
 #ifdef HAVE_LSTAT
-    /* call lstat() to check the file if exists and if is a symbolic link */
+    /* Call lstat() to check whether the file exists and whether it is a
+     * symbolic link */
     if (rank == 0) {
         struct stat st_buf;
         st_buf.st_mode = 0;
@@ -92,21 +130,23 @@ ncmpio_create(MPI_Comm comm,
         if (lstat(filename, &st_buf) == -1) file_exist = 0;
         errno = 0; /* reset errno */

-        /* If the file is a regular file, not a symbolic link, then we can
-         * delete the file first and later create it when calling
-         * MPI_File_open() with MPI_MODE_CREATE. It is OK to delete and then
-         * re-create the file if the file is a regular file. If there are other
-         * files symbolically linked to this file, then their links will still
-         * point to this file after it is re-created.
+        /* If the file is a regular file, not a symbolic link, then we delete
+         * the file first and later create it when calling MPI_File_open() with
+         * MPI_MODE_CREATE. If the file is a regular file, not a symbolic link,
+         * it is faster to delete it and then re-create the file, as truncating
+         * it to zero size is more expensive.
          *
         * If the file is a symbolic link, then we cannot delete the file, as
-         * the link will be gone.
+         * the link will be gone. If the file is deleted and there are other
+         * files symbolically linked to this file, then their links will become
+         * invalid.
          */
        if (S_ISREG(st_buf.st_mode)) use_trunc = 0;
     }
 #elif defined HAVE_ACCESS
-    /* if access() is available, use it to check whether file already exists
-     * rank 0 calls access() and broadcasts file_exist */
+    /* If access() is available, use it to check whether the file already
+     * exists, by having rank 0 call access() and broadcast file_exist.
+     */
     if (rank == 0) {
         if (access(filename, F_OK) == -1) file_exist = 0;
         errno = 0; /* reset errno */
@@ -114,21 +154,29 @@ ncmpio_create(MPI_Comm comm,
 #endif

     if (fIsSet(cmode, NC_NOCLOBBER)) {
-        /* check if file exists: NC_EEXIST is returned if the file already
-         * exists and NC_NOCLOBBER mode is used in ncmpi_create */
+        /* Error NC_EEXIST will be returned if the file already exists and
+         * NC_NOCLOBBER mode is set in ncmpi_create.
+         */
 #ifdef HAVE_ACCESS
         if (nprocs > 1)
             TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm);
-        if (file_exist) DEBUG_RETURN_ERROR(NC_EEXIST)
+        if (file_exist) {
+            NCI_Free(ncp);
+            DEBUG_RETURN_ERROR(NC_EEXIST)
+        }
 #else
-        /* add MPI_MODE_EXCL mode for MPI_File_open to check file existence */
+        /* Add MPI_MODE_EXCL mode for MPI_File_open, so it errors out if the
+         * file exists.
+         */
         fSet(mpiomode, MPI_MODE_EXCL);
         errno = 0; /* reset errno, as MPI_File_open may change it */
 #endif
     }
-    else { /* NC_CLOBBER is the default mode in create */
-        /* rank 0 truncates or deletes the file and ignores error code.
-         * Note calling MPI_File_set_size is expensive as it calls truncate()
+    else {
+        /* NC_CLOBBER is the default mode in ncmpi_create(). Below, rank 0
+         * truncates or deletes the file and ignores the error code. Note in
+         * some implementations of MPI-IO, calling MPI_File_set_size is
+         * expensive, as it may have all ranks call truncate().
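The delete-versus-truncate decision above hinges on whether the path is a regular file or a symbolic link. A minimal sketch of that rank-0 check (error handling omitted; the real code also resets errno and broadcasts the result):

#include <sys/stat.h>

/* Sketch: returns 1 if the file exists; *use_trunc is cleared only when
 * the path is a regular file, i.e. safe to delete and re-create. */
static int file_exists_check(const char *filename, int *use_trunc)
{
    struct stat st_buf;
    *use_trunc = 1;
    if (lstat(filename, &st_buf) == -1) return 0;
    if (S_ISREG(st_buf.st_mode)) *use_trunc = 0;
    return 1;
}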
*/ err = NC_NOERR; if (rank == 0 && file_exist) { @@ -140,27 +188,37 @@ ncmpio_create(MPI_Comm comm, err = unlink(filename); if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #else err = NC_NOERR; - TRACE_IO(MPI_File_delete, ((char *)path, MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - int errorclass; - MPI_Error_class(mpireturn, &errorclass); - if (errorclass != MPI_ERR_NO_SUCH_FILE) - /* ignore file not exist */ - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) + err = PNCIO_File_delete(filename); + else { + TRACE_IO(MPI_File_delete, (path, MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + int errorclass; + MPI_Error_class(mpireturn, &errorclass); + if (errorclass != MPI_ERR_NO_SUCH_FILE) + /* ignore file not exist */ + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } } #endif } - else { /* file is not a regular file, truncate it to zero size */ + else { + /* If file is not a regular file (e.g. a symbolic link), we + * cannot delete it and must truncate it to zero size. In this + * case, file open mode needs to remove MPI_MODE_CREATE. + */ + mpiomode = MPI_MODE_RDWR; + #ifdef HAVE_TRUNCATE - err = truncate(filename, 0); /* can be expensive */ + err = truncate(filename, 0); /* This may be expensive */ if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #elif defined HAVE_OPEN @@ -173,82 +231,80 @@ ncmpio_create(MPI_Comm comm, DEBUG_ASSIGN_ERROR(err, NC_EFILE) } #else - /* call MPI_File_set_size() to truncate the file. Note this can - * be expensive. + /* When all POSIX system calls are not available, the last + * resort is to call MPI_File_set_size() to truncate the file. + * Note for some ROMIO versions that have all processes call + * truncate(), this option can be expensive. 
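The clobber step follows a common pattern: only rank 0 touches the file system, then the outcome is agreed upon by all ranks. A minimal sketch (using unlink(); the real code may truncate instead, and also broadcasts the possibly-updated open mode):

#include <errno.h>
#include <unistd.h>
#include <mpi.h>

/* Sketch: rank 0 deletes the file; every rank then receives the result. */
static int clobber_and_sync(MPI_Comm comm, const char *filename)
{
    int rank, err = 0;
    MPI_Comm_rank(comm, &rank);
    if (rank == 0 && unlink(filename) != 0 && errno != ENOENT)
        err = -1;  /* ENOENT ("file does not exist") is ignored */
    MPI_Bcast(&err, 1, MPI_INT, 0, comm);
    return err;
}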
*/
             err = NC_NOERR;
-            TRACE_IO(MPI_File_open, (MPI_COMM_SELF, (char *)path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh));
-            if (mpireturn != MPI_SUCCESS) {
-                int errorclass;
-                MPI_Error_class(mpireturn, &errorclass);
-                err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+                PNCIO_File *pncio_fh;
+                pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+                err = PNCIO_File_open(MPI_COMM_SELF, filename,
+                                      MPI_MODE_RDWR, MPI_INFO_NULL,
+                                      pncio_fh);
+                if (err == NC_NOERR) {
+                    PNCIO_File_set_size(pncio_fh, 0); /* can be expensive */
+                    PNCIO_File_close(pncio_fh);
+                }
+                NCI_Free(pncio_fh);
             }
             else {
-                TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */
+                TRACE_IO(MPI_File_open, (MPI_COMM_SELF, path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh));
                 if (mpireturn != MPI_SUCCESS) {
                     int errorclass;
                     MPI_Error_class(mpireturn, &errorclass);
                     err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
                 }
                 else {
-                    TRACE_IO(MPI_File_close, (&fh));
+                    TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */
                     if (mpireturn != MPI_SUCCESS) {
                         int errorclass;
                         MPI_Error_class(mpireturn, &errorclass);
                         err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
                     }
+                    else {
+                        TRACE_IO(MPI_File_close, (&fh));
+                        if (mpireturn != MPI_SUCCESS) {
+                            int errorclass;
+                            MPI_Error_class(mpireturn, &errorclass);
+                            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+                        }
+                    }
                 }
             }
 #endif
             }
             if (errno == ENOENT) errno = 0; /* reset errno */
         }
-        /* all processes must wait here until file deletion is completed */
-        if (nprocs > 1)
-            TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, comm);
-        if (err != NC_NOERR) return err;
-    }
-
-    /* create file collectively -------------------------------------------- */
-    TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh));
-    if (mpireturn != MPI_SUCCESS) {
-#ifndef HAVE_ACCESS
-        if (fIsSet(cmode, NC_NOCLOBBER)) {
-            /* This is the case when NC_NOCLOBBER is used in file creation and
-             * function access() is not available. MPI_MODE_EXCL is set in open
-             * mode. When MPI_MODE_EXCL is used and the file already exists,
-             * MPI-IO should return error class MPI_ERR_FILE_EXISTS. But, some
-             * MPI-IO implementations (older ROMIO) do not correctly return
-             * this error class. In this case, we can do the followings: check
-             * errno to see if it set to EEXIST. Note usually rank 0 makes the
-             * file open call and can be the only one having errno set.
-             */
-            if (nprocs > 1)
-                TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm);
-            if (errno == EEXIST) DEBUG_RETURN_ERROR(NC_EEXIST)
-        }
-#endif
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the file
-         * already exists, MPI-IO should return error class MPI_ERR_FILE_EXISTS
-         * which PnetCDF will return error code NC_EEXIST. This is checked
-         * inside of ncmpii_error_mpi2nc()
+        /* All processes must wait here until the root process completes
+         * clobbering the file. Note mpiomode may be changed to remove
+         * MPI_MODE_CREATE when the file to be clobbered is a symbolic link.
          */
+        if (nprocs > 1) {
+            int msg[2];
+            msg[0] = err;
+            msg[1] = mpiomode;
+            TRACE_COMM(MPI_Bcast)(msg, 2, MPI_INT, 0, comm);
+            err      = msg[0];
+            mpiomode = msg[1];
+        }
+        if (err != NC_NOERR) return err;
     }
-    else
-        /* reset errno, as MPI_File_open may change it, even for MPI_SUCCESS */
-        errno = 0;

+    /* Now the file has been clobbered, i.e. deleted if it was not a symbolic
+     * link. If it was a symbolic link, it has now been truncated to zero
+     * size.
+ */ - /* get the I/O hints used/modified by MPI-IO */ - TRACE_IO(MPI_File_get_info, (fh, &info_used)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); + ncp->path = path; /* reuse path duplicated in dispatch layer */ + ncp->pncio_fh = NULL; /* non-aggregators have NULL pncio_fh */ + ncp->mpiomode = mpiomode; + ncp->mpiinfo = MPI_INFO_NULL; - /* Now the file has been successfully created, allocate/set NC object */ + /* For file create, ignore NC_NOWRITE if set in cmode argument. */ + ncp->iomode = cmode | NC_WRITE; - /* allocate buffer for header object NC and initialize its contents */ - ncp = (NC*) NCI_Calloc(1, sizeof(NC)); - if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + ncp->collective_fh = MPI_FILE_NULL; + ncp->independent_fh = MPI_FILE_NULL; /* set the file format version based on the create mode, cmode */ if (fIsSet(cmode, NC_64BIT_DATA)) ncp->format = 5; @@ -259,6 +315,7 @@ ncmpio_create(MPI_Comm comm, else ncp->format = 1; } + /* indicate this is from ncmpi_create */ fSet(ncp->flags, NC_MODE_CREATE); /* create automatically enter write mode */ fClr(ncp->flags, NC_MODE_RDONLY); @@ -267,44 +324,13 @@ ncmpio_create(MPI_Comm comm, /* PnetCDF default mode is no fill */ fClr(ncp->flags, NC_MODE_FILL); - ncp->ncid = ncid; - - /* chunk size for reading header, set to default before check hints */ - ncp->chunk = PNC_DEFAULT_CHUNKSIZE; - - /* calculate the true header size (not-yet aligned) - * No need to do this now. - * ncp->xsz = ncmpio_hdr_len_NC(ncp); - */ - /* initialize unlimited_id as no unlimited dimension yet defined */ ncp->dims.unlimited_id = -1; - /* buffer to pack noncontiguous user buffers when calling wait() */ - ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; - - /* Extract PnetCDF specific I/O hints from user_info and set default hint - * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and - * priors fail to preserve user hints that are not recognized by the MPI - * libraries. - */ - ncmpio_set_pnetcdf_hints(ncp, user_info, info_used); - - /* For file create, ignore if NC_NOWRITE set in cmode by user */ - ncp->iomode = cmode | NC_WRITE; - ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ - ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */ - ncp->mpiomode = mpiomode; - ncp->rank = rank; - ncp->nprocs = nprocs; - ncp->collective_fh = fh; - ncp->independent_fh = (nprocs > 1) ? MPI_FILE_NULL : fh; - ncp->path = (char*) NCI_Malloc(strlen(path) + 1); - strcpy(ncp->path, path); - #ifdef PNETCDF_DEBUG /* PNETCDF_DEBUG is set at configure time, which will be overwritten by - * the run-time environment variable PNETCDF_SAFE_MODE */ + * the run-time environment variable PNETCDF_SAFE_MODE. + */ ncp->safe_mode = 1; #endif /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform @@ -313,24 +339,211 @@ ncmpio_create(MPI_Comm comm, if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { if (*env_str == '0') ncp->safe_mode = 0; else ncp->safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). In this case, safe_mode is enabled */ + /* If PNETCDF_SAFE_MODE is set but without a value, *env_str can + * be '\0' (null character). In this case, safe_mode is enabled. + */ } - /* determine whether to enable intra-node aggregation and set up all - * intra-node aggregation metadata. - * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature - * is enabled globally for all processes. 
- * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively
-     * enabled for the aggregation group of this process.
+    /* Construct a list of unique IDs of compute nodes allocated to this job
+     * and save it in ncp->node_ids[nprocs], which contains the node ID of
+     * each rank. The node IDs are used either when intra-node aggregation
+     * (INA) is enabled or when using PnetCDF's PNCIO driver.
+     *
+     * When intra-node aggregation (INA) is enabled, node IDs are used to
+     * create a new MPI communicator consisting of the intra-node aggregators
+     * only. The communicator will be used to call file open in MPI-IO or
+     * PnetCDF's PNCIO driver. This means only intra-node aggregators will
+     * perform file I/O in PnetCDF collective put and get operations.
      */
-    ncp->my_aggr = -1;
-    if (ncp->num_aggrs_per_node != 0) {
-        err = ncmpio_intra_node_aggr_init(ncp);
+    ncp->node_ids = NULL;
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node > 0) {
+        err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids);
         if (err != NC_NOERR) return err;
+
+        /* When the total number of aggregators is >= the number of processes,
+         * disable intra-node aggregation.
+         */
+        if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs)
+            ncp->num_aggrs_per_node = 0;
     }

-    *ncpp = (void*)ncp;
+    /* ncp->num_aggrs_per_node == 0 or > 0 indicates whether the INA feature
+     * is disabled or enabled globally for all processes.
+     */
+    ncp->my_aggr       = -1;
+    ncp->ina_comm      = MPI_COMM_NULL;
+    ncp->ina_nprocs    = 0;
+    ncp->ina_rank      = -1;
+    ncp->ina_node_list = NULL;
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Divide all ranks into groups. Each group is assigned one intra-node
+         * aggregator. The following metadata related to intra-node aggregation
+         * will be set up in ncmpio_ina_init().
+         * ncp->my_aggr is the aggregator's rank ID (relative to ncp->comm) of
+         * this group. When == ncp->rank, this rank is an aggregator.
+         * ncp->num_nonaggrs is the number of non-aggregators assigned to this
+         * rank (an aggregator).
+         * ncp->ina_comm is an MPI communicator consisting of only intra-node
+         * aggregators across all nodes, which will be used when calling
+         * MPI_File_open(). For non-aggregators, it == MPI_COMM_NULL.
+         * ncp->node_ids[] will be modified to contain the node IDs of all
+         * intra-node aggregators, and will be passed to pncio_fh.
+         */
+        err = ncmpio_ina_init(ncp);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* As non-aggregators will not perform any file I/O, we can now replace
+         * comm with ina_comm. Same for nprocs.
+         */
+        comm   = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+
+        /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of
+         * this subroutine is to open the file and obtain the file handle,
+         * non-aggregators can skip it.
+         */
+        if (comm == MPI_COMM_NULL) {
+            MPI_Info_create(&ncp->mpiinfo);
+            goto fn_exit;
+        }
+    }
+
+    /* create file collectively -------------------------------------------- */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO) {
+        TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh));
+        if (mpireturn != MPI_SUCCESS) {
+#ifndef HAVE_ACCESS
+            if (fIsSet(cmode, NC_NOCLOBBER)) {
+                /* This is the case when NC_NOCLOBBER is used in file creation
+                 * and function access() is not available. MPI_MODE_EXCL is set
+                 * in open mode. When MPI_MODE_EXCL is used and the file
+                 * already exists, MPI-IO should return error class
+                 * MPI_ERR_FILE_EXISTS. But, some MPI-IO implementations (older
+                 * ROMIO) do not correctly return this error class.
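Stepping back to the intra-node aggregation setup above: an aggregators-only communicator like ncp->ina_comm can be derived with MPI_Comm_split, where non-aggregators pass MPI_UNDEFINED and receive MPI_COMM_NULL. A minimal sketch (the grouping in ncmpio_ina_init() is based on node IDs and is more involved):

#include <mpi.h>

/* Sketch: aggregators join color 0; all others get MPI_COMM_NULL. */
static MPI_Comm make_aggr_comm(MPI_Comm comm, int is_aggr)
{
    MPI_Comm newcomm;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_split(comm, is_aggr ? 0 : MPI_UNDEFINED, rank, &newcomm);
    return newcomm;
}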
In this
+                 * case, we can do the following: check errno to see if it is
+                 * set to EEXIST. Note usually rank 0 makes the file open call
+                 * and can be the only one having errno set.
+                 */
+                if (nprocs > 1)
+                    TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm);
+                if (errno == EEXIST) {
+                    NCI_Free(ncp);
+                    DEBUG_FOPEN_ERROR(NC_EEXIST)
+                }
+            }
+#endif
+            err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_open");
+            DEBUG_FOPEN_ERROR(err);
+            /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the
+             * file already exists, MPI-IO should return error class
+             * MPI_ERR_FILE_EXISTS which PnetCDF will return error code
+             * NC_EEXIST. This is checked inside of ncmpii_error_mpi2nc().
+             */
+        }
+        else
+            /* reset errno, as MPI_File_open may change it, even if it returns
+             * MPI_SUCCESS
+             */
+            errno = 0;
+
+        /* Now the file has been successfully created */
+        ncp->collective_fh  = fh;
+        ncp->independent_fh = (nprocs == 1) ? fh : MPI_FILE_NULL;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+    }
+    else {
+        /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1, sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes   = ncp->num_nodes;
+        ncp->pncio_fh->node_ids    = ncp->node_ids;
+
+        err = PNCIO_File_open(comm, filename, mpiomode, user_info,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err)
+
+        /* Now the file has been successfully created, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err)
+    }
+
+    /* Copy MPI-IO hints into ncp->mpiinfo */
+    ncmpio_hint_set(ncp, ncp->mpiinfo);
+
+fn_exit:
+    if (ncp->num_aggrs_per_node > 0) {
+        /* When intra-node aggregation is enabled, it is necessary to make sure
+         * non-aggregators obtain consistent values of file striping hints.
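The striping-hint consistency step below parses hint strings with strtol guarded by errno. A minimal sketch of that idiom (the helper name is hypothetical):

#include <errno.h>
#include <stdlib.h>

/* Sketch: parse an integer hint string; fall back to 0 on any error. */
static int parse_hint_int(const char *value)
{
    char *endp;
    long v;
    errno = 0;  /* must be cleared before strtol */
    v = strtol(value, &endp, 10);
    if (errno != 0 || endp == value) return 0;
    return (int) v;
}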
+         *
+         * Non-aggregators do not have hints returned from MPI_File_get_info().
+         */
+        int striping_info[2];
+        if (ncp->rank == 0) {
+            MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[0] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[0] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[0] = 0;
+            }
+
+            MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[1] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[1] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[1] = 0;
+            }
+        }
+
+        MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm);
+
+        if (ncp->my_aggr != ncp->rank) {
+            sprintf(value, "%d", striping_info[0]);
+            MPI_Info_set(ncp->mpiinfo, "striping_unit", value);
+            sprintf(value, "%d", striping_info[1]);
+            MPI_Info_set(ncp->mpiinfo, "striping_factor", value);
+        }
+    }
+
+/*
+if (ncp->rank == 0) {
+    int i, nkeys;
+    MPI_Info_get_nkeys(ncp->mpiinfo, &nkeys);
+    printf("%s line %d: MPI File Info: nkeys = %d\n",__func__,__LINE__,nkeys);
+    for (i=0; i<nkeys; i++) {
+        char key[MPI_MAX_INFO_KEY];
+        int valuelen;
+        MPI_Info_get_nthkey(ncp->mpiinfo, i, key);
+        MPI_Info_get_valuelen(ncp->mpiinfo, key, &valuelen, &flag);
+        MPI_Info_get(ncp->mpiinfo, key, valuelen+1, value, &flag);
+        printf("MPI File Info: [%2d] key = %25s, value = %s\n",i,key,value);
+    }
+}
+*/
+
+    /* ina_node_list is no longer needed */
+    if (ncp->ina_node_list != NULL) {
+        NCI_Free(ncp->ina_node_list);
+        ncp->ina_node_list = NULL;
+    }
+    /* node_ids is no longer needed */
+    if (ncp->node_ids != NULL) {
+        NCI_Free(ncp->node_ids);
+        ncp->node_ids = NULL;
+    }
+    if (ncp->pncio_fh != NULL)
+        ncp->pncio_fh->node_ids = NULL;

     return NC_NOERR;
 }
diff --git a/src/drivers/ncmpio/ncmpio_enddef.c b/src/drivers/ncmpio/ncmpio_enddef.c
index efc99657a..b45219a91 100644
--- a/src/drivers/ncmpio/ncmpio_enddef.c
+++ b/src/drivers/ncmpio/ncmpio_enddef.c
@@ -118,8 +118,8 @@ move_file_block(NC *ncp,
         get_size = pread(fd, buf, chunk_size, off_from);
         if (get_size < 0) {
             fprintf(stderr,
-                    "Error at %s line %d: pread file %s offset "OFFFMT" size %zd (%s)\n",
-                    __func__,__LINE__,path,off_from,chunk_size,strerror(errno));
+                    "Error at %s line %d: pread file %s offset %lld size %zd (%s)\n",
+                    __func__,__LINE__,path,(long long)off_from,chunk_size,strerror(errno));
             DEBUG_RETURN_ERROR(NC_EREAD)
         }
         ncp->get_size += get_size;
@@ -138,8 +138,8 @@ move_file_block(NC *ncp,
         put_size = pwrite(fd, buf, get_size, off_to);
         if (put_size < 0) {
             fprintf(stderr,
-                    "Error at %s line %d: pwrite file %s offset "OFFFMT" size %zd (%s)\n",
-                    __func__,__LINE__,path,off_to,get_size,strerror(errno));
+                    "Error at %s line %d: pwrite file %s offset %lld size %zd (%s)\n",
+                    __func__,__LINE__,path,(long long)off_to,get_size,strerror(errno));
             DEBUG_RETURN_ERROR(NC_EREAD)
         }
         ncp->put_size += put_size;
@@ -167,21 +167,21 @@ move_file_block(NC *ncp,
                 MPI_Offset from,   /* source starting file offset */
                 MPI_Offset nbytes) /* amount to be moved */
 {
-    char *mpi_name;
-    int rank, nprocs, mpireturn, err, status=NC_NOERR, do_coll;
+    int rank, nprocs, status=NC_NOERR, do_coll;
     void *buf;
     size_t num_moves, mv_amnt, p_units;
-    MPI_Offset off_last, off_from, off_to;
-    MPI_Status mpistatus;
-    MPI_File fh;
+    MPI_Offset off_last, off_from, off_to, rlen, wlen;
+    MPI_Comm comm;

-    rank   = ncp->rank;
-    nprocs = ncp->nprocs;
-
-    /* collective_fh can be used in either MPI independent or collective I/O
-     * APIs to move data, within this subroutine.
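The serial path of move_file_block() shown above copies a file region through a bounce buffer with pread()/pwrite(). A minimal sketch of one such chunk move (the helper name is hypothetical; the real code also accumulates get_size/put_size and maps failures to NC error codes):

#include <unistd.h>

/* Sketch: copy up to len bytes from off_from to off_to via buf. */
static int move_chunk(int fd, char *buf, size_t len,
                      off_t off_from, off_t off_to)
{
    ssize_t r = pread(fd, buf, len, off_from);
    if (r < 0) return -1;
    if (pwrite(fd, buf, (size_t)r, off_to) != r) return -1;
    return 0;
}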
+ /* If intra-node aggregation is enabled, then only the aggregators perform + * the movement. */ - fh = ncp->collective_fh; + if (ncp->num_aggrs_per_node > 0 && ncp->ina_comm == MPI_COMM_NULL) + return NC_NOERR; + + comm = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->comm : ncp->ina_comm; + rank = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->rank : ncp->ina_rank; + nprocs = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->nprocs : ncp->ina_nprocs; /* MPI-IO fileview has been reset in ncmpi_redef() to make the entire file * visible @@ -192,7 +192,7 @@ move_file_block(NC *ncp, * independent I/O subroutines, as the data partitioned among processes are * not interleaved and thus need no collective I/O. */ - do_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); + do_coll = (nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); /* buf will be used as a temporal buffer to move data in chunks, i.e. * read a chunk and later write to the new location @@ -211,7 +211,7 @@ move_file_block(NC *ncp, /* move the data section starting from its tail toward its beginning */ while (nbytes > 0) { - int chunk_size, get_size=0; + int chunk_size; if (mv_amnt == p_units) { /* each rank moves amount of chunk_size */ @@ -231,88 +231,33 @@ move_file_block(NC *ncp, chunk_size = 0; } - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - mpireturn = MPI_SUCCESS; + PNCIO_View buf_view; + buf_view.type = MPI_BYTE; + buf_view.size = chunk_size; + buf_view.count = 1; + buf_view.is_contig = 1; /* read from file at off_from for amount of chunk_size */ - if (do_coll) { - TRACE_IO(MPI_File_read_at_all, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - else if (chunk_size > 0) { - TRACE_IO(MPI_File_read_at, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR && err == NC_EFILE) - DEBUG_ASSIGN_ERROR(status, NC_EREAD) - get_size = chunk_size; - } - else if (chunk_size > 0) { - /* for zero-length read, MPI_Get_count may report incorrect result - * for some MPICH version, due to the uninitialized MPI_Status - * object passed to MPI-IO calls. Thus we initialize it above to - * work around. See MPICH ticket: - * https://trac.mpich.org/projects/mpich/ticket/2332 - * - * Update the number of bytes read since file open. - * Because each rank reads and writes no more than one chunk_size - * at a time and chunk_size is < NC_MAX_INT, it is OK to call - * MPI_Get_count, instead of MPI_Get_count_c. - */ - MPI_Get_count(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; - } + rlen = 0; + if (do_coll) + rlen = ncmpio_file_read_at_all(ncp, off_from, buf, buf_view); + else if (chunk_size > 0) + rlen = ncmpio_file_read_at(ncp, off_from, buf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; /* to prevent from one rank's write run faster than other's read */ - if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); - - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. 
- */
-            memset(&mpistatus, 0, sizeof(MPI_Status));
-            mpireturn = MPI_SUCCESS;
+            if (nprocs > 1) MPI_Barrier(comm);

-            /* Write to new location at off_to for amount of get_size. Assuming the
-             * call to MPI_Get_count() above returns the actual amount of data read
-             * from the file, i.e. get_size.
+            /* Write to the new location at off_to for rlen bytes, the amount
+             * actually read above.
              */
-            if (do_coll) {
-                TRACE_IO(MPI_File_write_at_all, (fh, off_to, buf,
-                                                 get_size /* NOT chunk_size */,
-                                                 MPI_BYTE, &mpistatus));
-            }
-            else if (get_size > 0) {
-                TRACE_IO(MPI_File_write_at, (fh, off_to, buf,
-                                             get_size /* NOT chunk_size */,
-                                             MPI_BYTE, &mpistatus));
-            }
-            if (mpireturn != MPI_SUCCESS) {
-                err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-                if (status == NC_NOERR && err == NC_EFILE)
-                    DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
-            }
-            else if (get_size > 0) {
-                /* update the number of bytes written since file open.
-                 * Because each rank reads and writes no more than one chunk_size
-                 * at a time and chunk_size is < NC_MAX_INT, it is OK to call
-                 * MPI_Get_count, instead of MPI_Get_count_c.
-                 */
-                int put_size;
-                mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-                if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                    ncp->put_size += get_size; /* or chunk_size */
-                else
-                    ncp->put_size += put_size;
-            }
+            buf_view.size = (rlen > 0) ? rlen : 0;
+            wlen = 0;
+            if (do_coll)
+                /* all ranks must participate in the collective write, even
+                 * when their rlen is zero */
+                wlen = ncmpio_file_write_at_all(ncp, off_to, buf, buf_view);
+            else if (rlen > 0)
+                wlen = ncmpio_file_write_at(ncp, off_to, buf, buf_view);
+            if (status == NC_NOERR && wlen < 0) status = (int)wlen;

         /* move on to the next round */
         mv_amnt = p_units;
@@ -602,20 +547,26 @@ NC_begins(NC *ncp)
 static int
 write_NC(NC *ncp)
 {
-    char *mpi_name;
-    int status=NC_NOERR, mpireturn, err, is_coll;
+    int status=NC_NOERR, is_coll=0;
     MPI_Offset i, header_wlen, ntimes;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    PNCIO_View buf_view;

     assert(!NC_readonly(ncp));

+    buf_view.is_contig = 1;
+
     /* Depending on whether NC_HCOLL is set, writing file header can be done
      * through either MPI collective or independent write call.
      * When
      *   ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh
+     * For ranks participating in the collective MPI write call, is_coll is
+     * set to 1; otherwise it is 0.
      */
-    is_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) ? 1 : 0;
-    fh = ncp->collective_fh;
+    if (fIsSet(ncp->flags, NC_HCOLL)) {
+        if (ncp->num_aggrs_per_node > 0)
+            is_coll = (ncp->ina_nprocs > 1 && ncp->rank == ncp->my_aggr);
+        else
+            is_coll = (ncp->nprocs > 1);
+    }

     /* In NC_begins(), root's ncp->xsz and ncp->begin_var, root's header
      * size and extent, have been broadcast (sync-ed) among processes.
@@ -673,64 +624,44 @@ write_NC(NC *ncp)

     /* rank 0's fileview already includes the file header */

-    /* explicitly initialize mpistatus object to 0. For zero-length read,
-     * MPI_Get_count may report incorrect result for some MPICH version,
-     * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-     * Thus we initialize it above to work around.
- */
-    memset(&mpistatus, 0, sizeof(MPI_Status));
-
     /* write the header in chunks */
     offset  = 0;
     remain  = header_wlen;
     buf_ptr = buf;
+    buf_view.type  = MPI_BYTE;
+    buf_view.count = 1;
     for (i=0; i<ntimes; i++) {
-        int put_size, bufCount = (int) MIN(remain, NC_MAX_INT);
-        if (is_coll)
-            TRACE_IO(MPI_File_write_at_all, (fh, offset, buf_ptr, bufCount,
-                                             MPI_BYTE, &mpistatus));
-        else
-            TRACE_IO(MPI_File_write_at, (fh, offset, buf_ptr, bufCount,
-                                         MPI_BYTE, &mpistatus));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (status == NC_NOERR && err == NC_EFILE)
-                DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
-        }
-        else {
-            mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-            if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                ncp->put_size += bufCount;
-            else
-                ncp->put_size += put_size;
-        }
-        offset  += bufCount;
-        buf_ptr += bufCount;
-        remain  -= bufCount;
+        MPI_Offset wlen;
+        buf_view.size = MIN(remain, NC_MAX_INT);
+        if (is_coll)
+            wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view);
+        if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
+        offset  += buf_view.size;
+        buf_ptr += buf_view.size;
+        remain  -= buf_view.size;
     }
     NCI_Free(buf);
 }
-    else if (fIsSet(ncp->flags, NC_HCOLL)) {
+    else if (is_coll) {
         /* other processes participate the collective call */
-        for (i=0; i<ntimes; i++)
-            TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE,
-                                             &mpistatus));
+        buf_view.type  = MPI_BYTE;
+        buf_view.count = 1;
+        buf_view.size  = 0;
+        for (i=0; i<ntimes; i++)
+            ncmpio_file_write_at_all(ncp, 0, NULL, buf_view);
     }

     if (ncp->safe_mode == 1 && ncp->nprocs > 1) {
         /* broadcast root's status, because only root writes to the file */
-        int root_status = status;
+        int mpireturn, root_status = status;
         TRACE_COMM(MPI_Bcast)(&root_status, 1, MPI_INT, 0, ncp->comm);
-        /* root's write has failed, which is more serious than inconsistency */
-        if (root_status == NC_EWRITE) DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
+        if (mpireturn != MPI_SUCCESS)
+            status = ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast");
+        else if (root_status == NC_EWRITE)
+            /* root's write has failed, more serious than inconsistency */
+            DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
     }

     fClr(ncp->flags, NC_NDIRTY);
@@ -746,14 +677,14 @@ write_NC(NC *ncp)
  */
 #define CHECK_ERROR(err) {                                              \
     if (ncp->safe_mode == 1 && ncp->nprocs > 1) {                       \
-        int status;                                                     \
-        TRACE_COMM(MPI_Allreduce)(&err, &status, 1, MPI_INT, MPI_MIN,   \
+        int min_err;                                                    \
+        TRACE_COMM(MPI_Allreduce)(&err, &min_err, 1, MPI_INT, MPI_MIN,  \
                                   ncp->comm);                           \
         if (mpireturn != MPI_SUCCESS) {                                 \
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce");      \
             DEBUG_RETURN_ERROR(err)                                     \
         }                                                               \
-        if (status != NC_NOERR) return status;                          \
+        if (min_err != NC_NOERR) return min_err;                        \
     }                                                                   \
     else if (err != NC_NOERR)                                           \
         return err;                                                     \
@@ -1120,7 +1051,7 @@ read_hints(NC *ncp)

     /* get hints from the environment variable PNETCDF_HINTS, a string of
      * hints separated by ";" and each hint is in the form of hint=value. E.g.
-     * "cb_nodes=16;cb_config_list=*:6". If this environment variable is set,
+     * "cb_nodes=16;romio_ds_write=true". If this environment variable is set,
      * it overrides the same hints that were set by MPI_Info_set() called in
      * the application program.
*/ @@ -1309,26 +1240,28 @@ ncmpio__enddef(void *ncdp, if (ncp->r_align == 0) ncp->r_align = 4; else ncp->r_align = D_RNDUP(ncp->r_align, 4); - /* reflect the hint changes to the MPI info object, so the user can inquire - * what the true hint values are being used - */ - sprintf(value, OFFFMT, ncp->v_align); - MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); - sprintf(value, OFFFMT, ncp->r_align); - MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); + if (ncp->mpiinfo != MPI_INFO_NULL) { + /* reflect the hint changes to the MPI info object, so the user can + * inquire what the true hint values are being used + */ + sprintf(value, OFFFMT, ncp->v_align); + MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); + sprintf(value, OFFFMT, ncp->r_align); + MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); #ifdef ENABLE_SUBFILING - sprintf(value, "%d", ncp->num_subfiles); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); - if (ncp->num_subfiles > 1) { - /* TODO: should return subfile-related msg when there's an error */ - err = ncmpio_subfile_partition(ncp); - CHECK_ERROR(err) - } + sprintf(value, "%d", ncp->num_subfiles); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); + if (ncp->num_subfiles > 1) { + /* TODO: should return subfile-related msg when there's an error */ + err = ncmpio_subfile_partition(ncp); + CHECK_ERROR(err) + } #else - MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); + MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); #endif + } /* check whether sizes of all variables are legal */ err = ncmpio_NC_check_vlens(ncp); @@ -1491,7 +1424,8 @@ ncmpio__enddef(void *ncdp, /* first sync header objects in memory across all processes, and then root * writes the header to file. Note safe_mode error check will be done in - * write_NC() */ + * write_NC(). + */ status = write_NC(ncp); /* we should continue to exit define mode, even if header is inconsistent diff --git a/src/drivers/ncmpio/ncmpio_file_io.c b/src/drivers/ncmpio/ncmpio_file_io.c index 681cfd599..e519a3d96 100644 --- a/src/drivers/ncmpio/ncmpio_file_io.c +++ b/src/drivers/ncmpio/ncmpio_file_io.c @@ -17,313 +17,909 @@ #include #include "ncmpio_NC.h" -/*----< ncmpio_read_write() >------------------------------------------------*/ -int -ncmpio_read_write(NC *ncp, - int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ - int coll_indep, /* NC_REQ_COLL or NC_REQ_INDEP */ - MPI_Offset offset, - MPI_Offset buf_count, - MPI_Datatype buf_type, - void *buf, - int buftype_is_contig) +/*----< get_count() >--------------------------------------------------------*/ +/* This subroutine is independent. On success, the number of bytes read/written + * is returned (zero indicates nothing was read/written). Like POSIX read()/ + * write(), it is not an error if this number is smaller than the number of + * bytes requested. On error, a negative value, an NC error code, is returned. 
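The byte-count recovery performed by get_count() below amounts to: number of datatype instances reported in the status, times the datatype size. A minimal sketch (the helper name is hypothetical; the real code also handles the MPI_Count variants):

#include <mpi.h>

/* Sketch: bytes transferred according to an MPI_Status, or -1 when the
 * count is undefined (e.g. a partial transfer). */
static MPI_Offset bytes_transferred(MPI_Status *st, MPI_Datatype dt)
{
    int count, size;
    if (MPI_Get_count(st, dt, &count) != MPI_SUCCESS ||
        count == MPI_UNDEFINED)
        return -1;
    MPI_Type_size(dt, &size);
    return (MPI_Offset)count * size;
}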
+ */ +static +MPI_Offset get_count(MPI_Status *mpistatus, + MPI_Datatype datatype) { - char *mpi_name; - int status=NC_NOERR, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; - MPI_File fh; - MPI_Offset req_size; + int mpireturn; + + if (datatype == MPI_DATATYPE_NULL) return 0; #ifdef HAVE_MPI_TYPE_SIZE_C - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_c is introduced in MPI 4.0 */ - mpireturn = MPI_Type_size_c(buf_type, &btype_size); - mpi_name = "MPI_Type_size_c"; + MPI_Type_size_c(datatype, &type_size); #elif defined(HAVE_MPI_TYPE_SIZE_X) - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(buf_type, &btype_size); - mpi_name = "MPI_Type_size_x"; + MPI_Type_size_x(datatype, &type_size); #else - int btype_size; - mpireturn = MPI_Type_size(buf_type, &btype_size); - mpi_name = "MPI_Type_size"; + int type_size; + MPI_Type_size(datatype, &type_size); #endif - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - err = (err == NC_EFILE) ? NC_EREAD : err; - } - else if (btype_size == MPI_UNDEFINED) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: btype_size MPI_UNDEFINED buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + +#ifdef HAVE_MPI_GET_COUNT_C + MPI_Count count; + mpireturn = MPI_Get_count_c(mpistatus, datatype, &count); +#else + int count; + mpireturn = MPI_Get_count(mpistatus, datatype, &count); #endif - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - } - if (err != NC_NOERR) { - if (coll_indep == NC_REQ_COLL) { - DEBUG_ASSIGN_ERROR(status, err) - /* write nothing, but participate the collective call */ - buf_count = 0; - } - else - DEBUG_RETURN_ERROR(err) - } + if (mpireturn != MPI_SUCCESS || count == MPI_UNDEFINED) + /* In case of partial read/write, MPI_Get_elements() is supposed to be + * called to obtain the number of type map elements actually + * read/written in order to calculate the true read/write amount. Below + * skips this step and simply returns the partial read/write amount. + * See an example usage of MPI_Get_count() in Example 5.12 from MPI + * standard document. + */ + return NC_EFILE; - /* request size in bytes, may be > NC_MAX_INT */ - req_size = buf_count * btype_size; + return (MPI_Offset)count * type_size; +} - /* explicitly initialize mpistatus object to 0. For zero-length read, +/*----< ncmpio_file_read_at() >----------------------------------------------*/ +/* + * This function is independent. + */ +/* TODO: move check count against MAX_INT and call _c API */ +MPI_Offset +ncmpio_file_read_at(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. For zero-length read/write, * MPI_Get_count may report incorrect result for some MPICH version, * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 */ memset(&mpistatus, 0, sizeof(MPI_Status)); - if (coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - else - fh = ncp->independent_fh; + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; - if (rw_flag == NC_REQ_RD) { - void *xbuf=buf; - MPI_Datatype xbuf_type=buf_type; + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? 
ncp->independent_fh : ncp->collective_fh; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count xlen = (MPI_Count)buf_count; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - int xlen = (int)buf_count; + int count = (buf_view.is_contig) ? buf_view.size : 1; - if (buf_count > NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { + if (buf_view.size > NC_MAX_INT) { #ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; - } - else - DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) } + TRACE_IO(MPI_File_read_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) + } + + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_read_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_read_at_all() >------------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_read_at_all(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; - if (xlen > 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if read buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to read, as some MPI, - * e.g. Cray on KNL, can be significantly slow when read buffer is - * noncontiguous. - */ #ifdef HAVE_MPI_LARGE_COUNT - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - if (req_size > NC_MAX_INT) { - mpireturn = MPI_Type_contiguous(xlen, buf_type, &xbuf_type); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous"); - if (coll_indep == NC_REQ_COLL) - DEBUG_ASSIGN_ERROR(status, err) - else - DEBUG_RETURN_ERROR(err) - } - MPI_Type_commit(&xbuf_type); - xlen = 1; - } - else { - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - } + int count = (buf_view.is_contig) ? 
buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - xbuf = NCI_Malloc((size_t)req_size); + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but read nothing */ + count = 0; + } + TRACE_IO(MPI_File_read_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) } - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) { + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_read_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at() >---------------------------------------------*/ +/* + * This function is independent. + */ +MPI_Offset +ncmpio_file_write_at(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - TRACE_IO(MPI_File_read_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - } else { + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } + TRACE_IO(MPI_File_write_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_write_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at_all() >-----------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_write_at_all(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. 
For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - TRACE_IO(MPI_File_read_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but write nothing */ + count = 0; } + TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_write_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_getput_zero_req() >-------------------------------------------*/ +/* This function is called when this process has zero-length I/O request and + * must participate all the MPI collective calls involved in the collective + * APIs and wait_all(), which include setting fileview, collective read/write, + * another setting fileview. + * + * This function is collective. + */ +int +ncmpio_getput_zero_req(NC *ncp, int reqMode) +{ + int err, status=NC_NOERR; + MPI_Offset rlen, wlen; + PNCIO_View buf_view; + + buf_view.size = 0; + + /* When intra-node aggregation is enabled, non-aggregators do not access + * the file. + */ + if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr) + return NC_NOERR; + + /* do nothing if this came from an independent API */ + if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR; + + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + if (status == NC_NOERR) status = err; + + if (fIsSet(reqMode, NC_REQ_RD)) { + if (ncp->nprocs > 1) + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + else + rlen = ncmpio_file_read_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + } + else { /* write request */ + if (ncp->nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + else + wlen = ncmpio_file_write_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + } + + /* Reset fileview. 
Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + /* No longer need to reset the file view, as the root's fileview includes + * the whole file header. + */ + + return status; +} + +/*----< ncmpio_read_write() >------------------------------------------------*/ +int +ncmpio_read_write(NC *ncp, + int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ + MPI_Offset offset, + PNCIO_View buf_view, + void *buf) +{ + char *mpi_name; + int i, status=NC_NOERR, err=NC_NOERR, mpireturn, coll_indep; + int to_free_buftype=0; + MPI_Offset rlen, wlen; + + coll_indep = NC_REQ_INDEP; + if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + coll_indep = NC_REQ_COLL; + + /* for zero-sized request */ + if (buf_view.size == 0) { + if (coll_indep == NC_REQ_INDEP) + return NC_NOERR; + + if (rw_flag == NC_REQ_RD) { + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + if (rlen < 0) status = (int)rlen; } else { - /* update the number of bytes read since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count get_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; -#else - int get_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &get_size); - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + if (wlen < 0) status = (int)wlen; + } + goto fn_exit; + } + + /* buf_view.count is the number of offset-length pairs */ + + /* buf_view.size is in bytes, may be > NC_MAX_INT */ + + if (rw_flag == NC_REQ_RD) { + void *xbuf=buf; + +#ifndef HAVE_MPI_LARGE_COUNT + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else - ncp->get_size += btype_size * get_size; + if (coll_indep == NC_REQ_COLL) { + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + /* write nothing, but participate the collective call */ + buf_view.size = 0; } + else + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } #endif + +// printf("%s at %d: buf_view count=%lld type=%s size=%lld\n",__func__,__LINE__, buf_view.count, (buf_view.type==MPI_BYTE)?"MPI_BYTE":"NOT MPI_BYTE", buf_view.size); + + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of read buffer being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If read buffer is noncontiguous and size is < + * ncp->ibuf_size, we allocate a temporary contiguous buffer and + * use it to read. Later it is unpacked to user buffer. As some + * MPI, e.g. Cray on KNL, can be significantly slow when write + * buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. 
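The bounce-buffer strategy above reads into a contiguous scratch buffer and then scatters it to the noncontiguous user buffer described by offset/length pairs. A minimal sketch of that unpack step (field layout assumed from PNCIO_View; the helper name is hypothetical):

#include <string.h>

/* Sketch: scatter a contiguous buffer out to (off[i], len[i]) regions. */
static void unpack_pairs(char *user_buf, const char *xbuf,
                         const long long *off, const long long *len,
                         long long n)
{
    long long i;
    for (i = 0; i < n; i++) {
        memcpy(user_buf + off[i], xbuf, (size_t)len[i]);
        xbuf += len[i];
    }
}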
+ */ + xbuf = NCI_Malloc(buf_view.size); + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; } - if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - mpireturn = MPI_Unpack_c(xbuf, xlen, &pos, buf, (MPI_Count)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; +#else + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; #else - int pos=0; - mpireturn = MPI_Unpack(xbuf, xlen, &pos, buf, (int)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack"; + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) + rlen = ncmpio_file_read_at_all(ncp, offset, xbuf, buf_view); + else + rlen = ncmpio_file_read_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + + if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + char *in_ptr, *out_ptr; + in_ptr = xbuf; + +#if 0 + long long *wkl, nelems; int j; + wkl = (long long*) malloc(buf_view.size); + nelems=buf_view.size/8; + memcpy(wkl, xbuf, nelems*8); ncmpii_in_swapn(wkl, nelems, 8); + printf("%s at %d: nelems=%lld xbuf=(%p) ",__func__,__LINE__, nelems, xbuf); + for (i=0; i NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of write buffer being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If write buffer is noncontiguous and size is < + * ncp->ibuf_size, pack it a temporary contiguous buffer and use it + * to write. As some MPI, e.g. Cray on KNL, can be significantly + * slow when write buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. + */ + char *in_ptr, *out_ptr; + xbuf = NCI_Malloc(buf_view.size); + out_ptr = xbuf; +assert(buf != NULL); +// printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + +#if 0 +printf("%s at %d: buf = %p\n",__func__,__LINE__, buf); +printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count,buf_view.off[0],buf_view.off[1],buf_view.len[0],buf_view.len[1]); +int wkl[21]; #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; + for (i=0; i 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if write buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to write, as some MPI, - * e.g. Cray on KNL, can be significantly slow when write buffer is - * noncontiguous. 
- */ + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; - xbuf = NCI_Malloc(req_size); - mpireturn = MPI_Pack_c(buf, (MPI_Count)buf_count, buf_type, xbuf, - (MPI_Count)req_size, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; #else - if (req_size > NC_MAX_INT) { - /* skip packing write data into a temp buffer */ - xlen = (int)buf_count; - xbuf_type = buf_type; - mpireturn = MPI_SUCCESS; - } - else { - int pos=0; - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - xbuf = NCI_Malloc(xlen); - mpireturn = MPI_Pack(buf, (int)buf_count, buf_type, xbuf, - xlen, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack"; - } + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; +#else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) { + if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) + wlen = ncmpio_file_write_at_all(ncp, offset, xbuf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + + if (xbuf != buf) NCI_Free(xbuf); + if (to_free_buftype) + MPI_Type_free(&buf_view.type); + } + +fn_exit: + /* Reset fileview. Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + return status; +} + +/*----< ncmpio_file_close() >------------------------------------------------*/ +/* + * This function is collective. + */ +int +ncmpio_file_close(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + + if (ncp->independent_fh != ncp->collective_fh && + ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + } + else { + /* When intra-node aggregation is enabled, only aggregators have a + * non-NULL ncp->pncio_fh and non-aggregators has pncio_fh == NULL. + */ + if (ncp->pncio_fh != NULL) { + err = PNCIO_File_close(ncp->pncio_fh); + NCI_Free(ncp->pncio_fh); + ncp->pncio_fh = NULL; + } + } + + return err; +} + +/*----< ncmpio_file_delete() >-----------------------------------------------*/ +/* + * This function is collective. + * + * This subroutine is called only from ncmpi_abort. When the file is being + * created and an error occurs, the program is still in define mode. In this + * case, the file is deleted. 
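Both the read and write paths above turn offset/length pairs into an MPI datatype with MPI_Type_create_hindexed. A minimal sketch of that construction (non-large-count variant; error handling omitted):

#include <mpi.h>

/* Sketch: build and commit a byte-based hindexed type from n pairs. */
static MPI_Datatype make_buftype(int n, const int *len, const MPI_Aint *off)
{
    MPI_Datatype dt;
    MPI_Type_create_hindexed(n, len, off, MPI_BYTE, &dt);
    MPI_Type_commit(&dt);
    return dt;  /* caller must MPI_Type_free() it */
}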
+ */ +int +ncmpio_file_delete(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->rank == 0) { + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + else + err = PNCIO_File_delete(ncp->path); + } + + if (ncp->nprocs > 1) + MPI_Bcast(&err, 1, MPI_INT, 0, ncp->comm); + + return err; +} + +/*----< ncmpio_file_sync() >-------------------------------------------------*/ +/* This function must be called collectively, no matter if it is in collective + * or independent data mode. + */ +int +ncmpio_file_sync(NC *ncp) { + char *mpi_name; + int mpireturn; + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + if (ncp->pncio_fh == NULL) + return NC_NOERR; + return PNCIO_File_sync(ncp->pncio_fh); + } + + /* the remaining of this subroutine are for when using MPI-IO */ + + if (ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + if (ncp->nprocs == 1) return NC_NOERR; + + /* When intra-node aggregation is enabled, non-aggregator's + * ncp->collective_fh is always MPI_FILE_NULL. When disabled, + * ncp->collective_fh on all ranks is never MPI_FILE_NULL as collective + * mode is default in PnetCDF. + */ + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + /* Barrier is not necessary ... + TRACE_COMM(MPI_Barrier)(ncp->comm); + */ + + return NC_NOERR; +} + +/*----< ncmpio_file_set_view() >---------------------------------------------*/ +/* This subroutine is collective when using MPI-IO. When using internal PNCIO + * driver, this subroutine is independent. + */ +int +ncmpio_file_set_view(const NC *ncp, + MPI_Offset disp, /* IN/OUT */ + MPI_Datatype filetype, + MPI_Aint npairs, #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count *offsets, + MPI_Count *lengths #else - TRACE_IO(MPI_File_write_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Offset *offsets, + int *lengths #endif - } else { +) +{ + char *mpi_name; + int err, mpireturn, status=NC_NOERR; + MPI_File fh; + +assert(filetype == MPI_BYTE); +assert(disp == 0); + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + /* Skip setting fileview for ranks whose pncio_fh is NULL */ + if (ncp->pncio_fh == NULL) + return NC_NOERR; + + /* When PnetCDF's internal PNCIO driver is used, the request has been + * flattened into offsets and lengths. Thus passed-in filetype is not + * constructed. Note offsets and lengths are not relative to any MPI-IO + * fileview. They will be reused in PNCIO driver as a flattened file + * type struct, which avoids repeated work of constructing and + * flattening the filetype. + */ + return PNCIO_File_set_view(ncp->pncio_fh, disp, filetype, npairs, + offsets, lengths); + } + + /* Now, ncp->fstype == PNCIO_FSTYPE_MPIIO, i.e. using MPI-IO. */ + int to_free_filetype=0; + + /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + ? 
ncp->collective_fh : ncp->independent_fh; + + if (fh == MPI_FILE_NULL) /* not INA aggregator */ + return NC_NOERR; + + if (npairs == 0) /* zero-sized requests */ + filetype = MPI_BYTE; + else { #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets, + MPI_BYTE, &filetype); #else - TRACE_IO(MPI_File_write_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + assert(sizeof(*offsets) == sizeof(MPI_Aint)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed(npairs, lengths, + (MPI_Aint*)offsets, + MPI_BYTE, &filetype); #endif - } if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (status == NC_NOERR) status = err; } else { - /* update the number of bytes written since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count put_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &put_size); - ncp->put_size += put_size; -#else - int put_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); -#endif - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else - ncp->put_size += btype_size * put_size; + mpireturn = MPI_Type_commit(&filetype); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; } -#endif + else + to_free_filetype = 1; } - if (xbuf != buf) NCI_Free(xbuf); - if (xbuf_type != buf_type && xbuf_type != MPI_BYTE) - MPI_Type_free(&xbuf_type); } + TRACE_IO(MPI_File_set_view, (fh, disp, MPI_BYTE, filetype, "native", + MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (status == NC_NOERR) status = err; +assert(0); + } + + if (to_free_filetype) + MPI_Type_free(&filetype); + return status; } +/*----< ncmpio_file_open() >-------------------------------------------------*/ +int +ncmpio_file_open(NC *ncp, + MPI_Comm comm, + const char *path, + int omode, + MPI_Info info) +{ + int err=NC_NOERR; + + /* open file collectively */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + MPI_File fh; + + TRACE_IO(MPI_File_open, (comm, path, omode, info, &fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + + /* Now the file has been successfully opened */ + ncp->collective_fh = fh; + ncp->independent_fh = (ncp->nprocs > 1) ? 
MPI_FILE_NULL : fh;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS)
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    }
+    else { /* ncp->fstype != PNCIO_FSTYPE_MPIIO */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+
+        err = PNCIO_File_open(comm, path, omode, info, ncp->pncio_fh);
+        if (err != NC_NOERR) return err;
+
+        /* Now the file has been successfully opened, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+    }
+
+    return err;
+}
+
diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c
index 932b5027f..d936aa643 100644
--- a/src/drivers/ncmpio/ncmpio_file_misc.c
+++ b/src/drivers/ncmpio/ncmpio_file_misc.c
@@ -81,8 +81,7 @@ dup_NC(const NC *ref)
 int
 ncmpio_redef(void *ncdp)
 {
-    char *mpi_name;
-    int err, status=NC_NOERR, mpireturn;
+    int err, status=NC_NOERR;
     NC *ncp = (NC*)ncdp;
 
 #if 0
@@ -100,7 +99,7 @@ ncmpio_redef(void *ncdp)
     if (NC_indep(ncp)) /* exit independent mode, if in independent mode */
         ncmpio_end_indep_data(ncp);
 
-    /* duplicate a header to be used in enddef() for checking if header grows */
+    /* duplicate header to be used in enddef() for checking if header grows */
     ncp->old = dup_NC(ncp);
     if (ncp->old == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)
 
@@ -108,21 +107,8 @@ ncmpio_redef(void *ncdp)
     fSet(ncp->flags, NC_MODE_DEF);
 
     /* must reset fileview as header extent may later change in enddef() */
-    TRACE_IO(MPI_File_set_view, (ncp->collective_fh, 0, MPI_BYTE,
-                                 MPI_BYTE, "native", MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        DEBUG_ASSIGN_ERROR(status, err)
-    }
-
-    if (ncp->independent_fh != MPI_FILE_NULL) {
-        TRACE_IO(MPI_File_set_view, (ncp->independent_fh, 0, MPI_BYTE,
-                                     MPI_BYTE, "native", MPI_INFO_NULL));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            DEBUG_ASSIGN_ERROR(status, err)
-        }
-    }
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    DEBUG_ASSIGN_ERROR(status, err)
 
     return status;
 }
@@ -132,7 +118,6 @@ ncmpio_redef(void *ncdp)
 int
 ncmpio_begin_indep_data(void *ncdp)
 {
-    char *mpi_name;
     NC *ncp = (NC*)ncdp;
 
     if (NC_indef(ncp))  /* must not be in define mode */
@@ -151,6 +136,66 @@ ncmpio_begin_indep_data(void *ncdp)
     /* raise independent flag */
     fSet(ncp->flags, NC_MODE_INDEP);
 
+    /* Barrier is necessary to prevent non-aggregators from calling open()
+     * before the file has been collectively created by the aggregators.
+     */
+    MPI_Barrier(ncp->comm);
+
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+        /* When using PnetCDF's PNCIO driver, there are 2 scenarios:
+         * 1. When intra-node aggregation (INA) is enabled, at the end of
+         *    ncmpi_create/ncmpi_open, non-aggregators' pncio_fh is NULL. Thus,
+         *    when switching to independent data mode, we can reuse pncio_fh
+         *    to store the handle of the file opened with MPI_COMM_SELF. Note
+         *    whether pncio_fh is NULL does not tell whether INA is enabled.
+         * 2. When INA is disabled, all ranks call PNCIO_File_open() and thus
+         *    pncio_fh should not be NULL. In other words, this scenario
+         *    should not reach here at all. Because PnetCDF's PNCIO driver
+         *    relaxes the File_setview subroutine so that it can be called
+         *    independently, the same pncio_fh can be used for both collective
+         *    and independent I/O APIs.
+         *    Note we cannot reuse pncio_fh for the above scenario 1, because
+         *    in collective data mode, all ranks must participate in each
+         *    collective I/O call.
+         */
+        int err;
+        char *filename;
+
+        if (ncp->pncio_fh != NULL)
+            /* Only INA non-aggregators' pncio_fh can be NULL, because
+             * aggregators open the file collectively and their pncio_fh can
+             * never be NULL.
+             */
+            return NC_NOERR;
+
+        filename = ncmpii_remove_file_system_type_prefix(ncp->path);
+
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes = 1;
+        ncp->pncio_fh->node_ids = (int*) NCI_Malloc(sizeof(int));
+        ncp->pncio_fh->node_ids[0] = 0;
+
+        int omode = fClr(ncp->mpiomode, MPI_MODE_CREATE);
+
+        err = PNCIO_File_open(MPI_COMM_SELF, filename, omode, ncp->mpiinfo,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR)
+            return err;
+
+        /* get the I/O hints used/modified by the PNCIO driver */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) return err;
+
+        /* Add PnetCDF hints into ncp->mpiinfo */
+        ncmpio_hint_set(ncp, ncp->mpiinfo);
+
+        NCI_Free(ncp->pncio_fh->node_ids);
+        ncp->pncio_fh->node_ids = NULL;
+
+        return NC_NOERR;
+    }
+
     /* PnetCDF's default mode is collective. MPI file handle, collective_fh,
      * will never be MPI_FILE_NULL. We must use a separate MPI file handle
      * opened with MPI_COMM_SELF, because MPI_File_set_view is a collective
@@ -159,12 +204,20 @@
      * called.
      */
     if (ncp->independent_fh == MPI_FILE_NULL) {
+        char *mpi_name;
         int mpireturn;
-        TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path,
-                                 ncp->mpiomode, ncp->mpiinfo,
-                                 &ncp->independent_fh));
+        TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path, ncp->mpiomode,
+                                 ncp->mpiinfo, &ncp->independent_fh));
+        if (mpireturn != MPI_SUCCESS)
+            return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+
+        /* get the I/O hints used/modified by MPI-IO */
+        mpireturn = MPI_File_get_info(ncp->independent_fh, &ncp->mpiinfo);
         if (mpireturn != MPI_SUCCESS)
             return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+
+        /* Copy MPI-IO hints into ncp->mpiinfo */
+        ncmpio_hint_set(ncp, ncp->mpiinfo);
     }
     return NC_NOERR;
 }
@@ -242,9 +295,14 @@ ncmpio_abort(void *ncdp)
     }
 
     /* close the file */
-    err = ncmpio_close_files(ncp, doUnlink);
+    err = ncmpio_file_close(ncp);
     if (status == NC_NOERR ) status = err;
 
+    if (doUnlink) {
+        err = ncmpio_file_delete(ncp);
+        status = (status == NC_NOERR) ? err : status;
+    }
+
     /* free up space occupied by the header metadata */
     ncmpio_free_NC(ncp);
 
@@ -444,12 +502,23 @@
 int
 ncmpi_delete(const char *filename,
              MPI_Info    info)
 {
+    int err = NC_NOERR;
+#ifdef MIMIC_LUSTRE
+    char *path = ncmpii_remove_file_system_type_prefix(filename);
+    err = unlink(path);
+    if (err != 0)
+        err = ncmpii_error_posix2nc("unlink");
+#else
+    err = PNCIO_File_delete(filename);
+#if 0
     char *mpi_name;
-    int err=NC_NOERR, mpireturn;
+    int mpireturn;
 
-    TRACE_IO(MPI_File_delete, ((char*)filename, info));
+    TRACE_IO(MPI_File_delete, (filename, info));
     if (mpireturn != MPI_SUCCESS)
         err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+#endif
+#endif
 
     return err;
 }
 
diff --git a/src/drivers/ncmpio/ncmpio_filetype.c b/src/drivers/ncmpio/ncmpio_filetype.c
index 828ab4132..3d84d5407 100644
--- a/src/drivers/ncmpio/ncmpio_filetype.c
+++ b/src/drivers/ncmpio/ncmpio_filetype.c
@@ -506,6 +506,9 @@ ncmpio_filetype_create_vars(const NC *ncp,
     MPI_Offset i, nblocks, nelems, *blocklens;
     MPI_Datatype filetype=MPI_BYTE;
 
+/* This is no longer used, as all requests go to INA subroutines to flatten.
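The begin-indep logic above boils down to a lazy per-process reopen. Here is a minimal sketch of that pattern under stated assumptions (generic names, error handling trimmed; this is not the PnetCDF API itself): the file was already created collectively, so MPI_MODE_CREATE must be cleared before reopening it on MPI_COMM_SELF.

    #include <mpi.h>

    static int open_indep_handle(const char *path, int omode, MPI_Info info,
                                 MPI_File *indep_fh)
    {
        if (*indep_fh != MPI_FILE_NULL)  /* already opened earlier */
            return MPI_SUCCESS;

        omode &= ~MPI_MODE_CREATE;       /* file exists; do not re-create */

        /* MPI_COMM_SELF makes the open independent: no other rank needs to
         * participate, unlike a collective open on the file communicator */
        return MPI_File_open(MPI_COMM_SELF, path, omode, info, indep_fh);
    }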
*/ +assert(0); + if (stride == NULL) return filetype_create_vara(ncp, varp, start, count, offset_ptr, filetype_ptr, is_filetype_contig); @@ -606,105 +609,3 @@ ncmpio_filetype_create_vars(const NC *ncp, return err; } -/*----< ncmpio_file_set_view() >---------------------------------------------*/ -/* This function handles the special case for root process for setting its - * file view: to keeps the whole file header visible to the root process. This - * is because the root process may update the number of records or attributes - * into the file header while in data mode. In PnetCDF design, only root - * process can read/write the file header. - * This function is collective if called in collective data mode - */ -int -ncmpio_file_set_view(const NC *ncp, - MPI_File fh, - MPI_Offset *offset, /* IN/OUT */ - MPI_Datatype filetype) -{ - char *mpi_name; - int err, mpireturn, status=NC_NOERR; - - if (filetype == MPI_BYTE) { - /* filetype is a contiguous space, make the whole file visible */ - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, - "native", MPI_INFO_NULL)); - return NC_NOERR; - } - - if (ncp->rank == 0) { - /* prepend the whole file header to filetype */ - MPI_Datatype root_filetype=MPI_BYTE, ftypes[2]; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count blocklens[2]; - MPI_Count disps[2]; - blocklens[0] = ncp->begin_var; -#else - int blocklens[2]; - MPI_Aint disps[2]; - - /* check if header size > 2^31 */ - if (ncp->begin_var > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } - - blocklens[0] = (int)ncp->begin_var; -#endif - - /* first block is the header extent */ - disps[0] = 0; - ftypes[0] = MPI_BYTE; - - /* second block is filetype, the subarray request(s) to the variable */ - blocklens[1] = 1; - disps[1] = *offset; - ftypes[1] = filetype; - -#if !defined(HAVE_MPI_LARGE_COUNT) && (SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET) - if (*offset > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } -#endif - -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct_c"); - if (status == NC_NOERR) status = err; - } -#else - mpireturn = MPI_Type_create_struct(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - if (status == NC_NOERR) status = err; - } -#endif - MPI_Type_commit(&root_filetype); - -#ifndef HAVE_MPI_LARGE_COUNT -err_out: -#endif - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, root_filetype, "native", - MPI_INFO_NULL)); - if (root_filetype != MPI_BYTE) - MPI_Type_free(&root_filetype); - - /* now update the explicit offset to be used in MPI-IO call later */ - *offset = ncp->begin_var; - } - else { - TRACE_IO(MPI_File_set_view, (fh, *offset, MPI_BYTE, filetype, "native", - MPI_INFO_NULL)); - /* the explicit offset is already set in fileview */ - *offset = 0; - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } - - return status; -} - diff --git a/src/drivers/ncmpio/ncmpio_fill.c b/src/drivers/ncmpio/ncmpio_fill.c index e392a366d..ef6040d1b 100644 --- a/src/drivers/ncmpio/ncmpio_fill.c +++ b/src/drivers/ncmpio/ncmpio_fill.c @@ -144,13 +144,33 @@ fill_var_rec(NC *ncp, NC_var *varp, MPI_Offset recno) /* record number */ { - char *mpi_name; int err, status=NC_NOERR, mpireturn; void *buf; - MPI_Offset var_len, 
start, count, offset;
-    MPI_File fh;
-    MPI_Status mpistatus;
-    MPI_Datatype bufType;
+    MPI_Offset var_len, start, count, offset, wlen;
+    PNCIO_View buf_view;
+
+    buf_view.type = MPI_BYTE;
+    buf_view.count = 0;
+    buf_view.is_contig = 1;
+    buf_view.size = 0;
+    buf_view.off = NULL;
+    buf_view.len = NULL;
+
+    /* When intra-node aggregation is enabled, use the communicator consisting
+     * only of the aggregators; comm, nprocs, and rank below are set
+     * accordingly. Non-aggregators do not participate in the fill operation.
+     */
+    MPI_Comm comm = ncp->comm;
+    int nprocs = ncp->nprocs;
+    int rank = ncp->rank;
+    if (ncp->num_aggrs_per_node > 0) {
+        if (ncp->my_aggr != ncp->rank)
+            return NC_NOERR;
+
+        comm = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+        rank = ncp->ina_rank;
+    }
 
     if (varp->ndims == 0) /* scalar variable */
         var_len = 1;
@@ -162,14 +182,14 @@ fill_var_rec(NC *ncp,
         var_len = varp->dsizes[0];
 
     /* divide total number of elements of this variable among all processes */
-    count = var_len / ncp->nprocs;
-    start = count * ncp->rank;
-    if (ncp->rank < var_len % ncp->nprocs) {
-        start += ncp->rank;
+    count = var_len / nprocs;
+    start = count * rank;
+    if (rank < var_len % nprocs) {
+        start += rank;
         count++;
     }
     else {
-        start += var_len % ncp->nprocs;
+        start += var_len % nprocs;
     }
 
     /* allocate buffer space */
@@ -179,64 +199,45 @@ fill_var_rec(NC *ncp,
     err = fill_var_buf(varp, count, buf);
     if (err != NC_NOERR) {
         NCI_Free(buf);
-        count = 0; /* still participate collective calls below */
+        /* still participate in collective calls below */
+        buf_view.size = 0;
         status = err;
     }
 
+    /* make the entire file visible */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    status = (status == NC_NOERR) ? err : status;
+
     /* calculate the starting file offset for each process */
     offset = varp->begin;
     if (IS_RECVAR(varp)) offset += ncp->recsize * recno;
     offset += start * varp->xsz;
 
-    /* when ncp->nprocs == 1, we keep I/O mode in independent mode at all time */
-    fh = ncp->collective_fh;
-
-    /* make the entire file visible */
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
-                                 MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
 
     count *= varp->xsz;
-    bufType = MPI_BYTE;
-
 #ifndef HAVE_MPI_LARGE_COUNT
     if (count > NC_MAX_INT) {
         DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
         if (status == NC_NOERR) status = err;
-        count = 0; /* participate collective write with 0-length request */
+        /* participate collective write with 0-length request */
+        buf_view.size = 0;
     }
 #endif
+    if (status == NC_NOERR)
+        buf_view.size = count;
+
     /* write to variable collectively */
-    if (ncp->nprocs > 1) {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, (MPI_Count)count,
-                                           bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, (int)count,
-                                         bufType, &mpistatus));
-#endif
-    }
-    else {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, (MPI_Count)count,
-                                       bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at, (fh, offset, buf, (int)count,
-                                     bufType, &mpistatus));
-#endif
-    }
+    if (nprocs > 1)
+        wlen = ncmpio_file_write_at_all(ncp, offset, buf, buf_view);
+    else
+        wlen = ncmpio_file_write_at(ncp, offset, buf, buf_view);
+    if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
     NCI_Free(buf);
-    if (bufType != MPI_BYTE)
MPI_Type_free(&bufType);
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
 
     if (status != NC_NOERR) return status;
 
@@ -248,9 +249,9 @@ fill_var_rec(NC *ncp,
      * First, find the max numrecs among all processes.
      */
     MPI_Offset max_numrecs=recno+1;
-    if (ncp->nprocs > 1) {
+    if (nprocs > 1) {
         TRACE_COMM(MPI_Allreduce)(MPI_IN_PLACE, &max_numrecs, 1, MPI_OFFSET,
-                                  MPI_MAX, ncp->comm);
+                                  MPI_MAX, comm);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce");
             if (status == NC_NOERR) status = err;
@@ -363,24 +364,36 @@ fill_added_recs(NC *ncp, NC *old_ncp)
 static int
 fillerup_aggregate(NC *ncp, NC *old_ncp)
 {
-    int i, j, k, mpireturn, err, status=NC_NOERR;
+    int i, j, k, err, status=NC_NOERR;
     int start_vid, recno, nVarsFill;
-    char *buf_ptr, *noFill, *mpi_name;
+    char *buf_ptr, *noFill;
     void *buf;
     size_t nsegs;
-    MPI_Offset buf_len, var_len, nrecs, start, *count;
-    MPI_Datatype filetype, bufType;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    MPI_Offset buf_len, var_len, nrecs, start, *count, wlen;
     NC_var *varp;
+    PNCIO_View buf_view;
 #ifdef HAVE_MPI_LARGE_COUNT
-    MPI_Count *blocklengths, *offset;
+    MPI_Count *blocklengths=NULL, *offset=NULL;
 #else
-    int *blocklengths;
-    MPI_Aint *offset;
+    int *blocklengths=NULL;
+    MPI_Offset *offset=NULL;
 #endif
 
+    /* When intra-node aggregation is enabled, use the communicator consisting
+     * only of the aggregators; nprocs and rank below are set accordingly.
+     * Non-aggregators do not participate in the fill operation.
+     */
+    int nprocs = ncp->nprocs;
+    int rank = ncp->rank;
+    if (ncp->num_aggrs_per_node > 0) {
+        if (ncp->my_aggr != ncp->rank)
+            return NC_NOERR;
+
+        nprocs = ncp->ina_nprocs;
+        rank = ncp->ina_rank;
+    }
+
     /* find the starting vid for newly added variables */
     start_vid = 0;
     nrecs = 0;  /* the current number of records */
@@ -397,12 +410,16 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
      * variables' fill modes and overwrite local's if an inconsistency is found
      * Note ncp->vars.ndefined is already made consistent by this point.
      */
-    if (ncp->nprocs > 1) {
+    MPI_Comm comm = (ncp->num_aggrs_per_node > 0) ?
ncp->ina_comm : ncp->comm;
+
+    if (nprocs > 1) {
+        int mpireturn;
+
+        for (i=start_vid; i<ncp->vars.ndefined; i++)
             noFill[i-start_vid] = (char)(ncp->vars.value[i]->no_fill);
 
         TRACE_COMM(MPI_Bcast)(noFill, (ncp->vars.ndefined - start_vid),
-                              MPI_BYTE, 0, ncp->comm);
+                              MPI_BYTE, 0, comm);
         if (mpireturn != MPI_SUCCESS)
             return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast");
 
@@ -427,9 +444,9 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
     nsegs = (size_t)(ncp->vars.ndefined + ncp->vars.num_rec_vars * nrecs);
     count = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs);
 #ifdef HAVE_MPI_LARGE_COUNT
-    offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs);
+    offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs);
 #else
-    offset = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nsegs);
+    offset = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs);
 #endif
 
     /* calculate each segment's offset and count */
@@ -446,19 +463,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
         else var_len = varp->dsizes[0];
 
         /* divide evenly total number of variable's elements among processes */
-        count[j] = var_len / ncp->nprocs;
-        start = count[j] * ncp->rank;
-        if (ncp->rank < var_len % ncp->nprocs) {
-            start += ncp->rank;
+        count[j] = var_len / nprocs;
+        start = count[j] * rank;
+        if (rank < var_len % nprocs) {
+            start += rank;
             count[j]++;
         }
         else
-            start += var_len % ncp->nprocs;
+            start += var_len % nprocs;
 
         /* calculate the starting file offset */
         start *= varp->xsz;
         start += varp->begin;
-        offset[j] = (MPI_Aint)start;
+#ifdef HAVE_MPI_LARGE_COUNT
+        offset[j] = (MPI_Count)start;
+#else
+        offset[j] = start;
+#endif
         if (start != offset[j]) {
             DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
             if (status == NC_NOERR) status = err;
@@ -483,19 +504,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
         else var_len = varp->dsizes[1];
 
         /* divide total number of variable's elements among all processes */
-        count[j] = var_len / ncp->nprocs;
-        start = count[j] * ncp->rank;
-        if (ncp->rank < var_len % ncp->nprocs) {
-            start += ncp->rank;
+        count[j] = var_len / nprocs;
+        start = count[j] * rank;
+        if (rank < var_len % nprocs) {
+            start += rank;
             count[j]++;
         }
         else
-            start += var_len % ncp->nprocs;
+            start += var_len % nprocs;
 
         /* calculate the starting file offset */
         start *= varp->xsz;
         start += varp->begin + ncp->recsize * recno;
-        offset[j] = (MPI_Aint)start;
+#ifdef HAVE_MPI_LARGE_COUNT
+        offset[j] = (MPI_Count)start;
+#else
+        offset[j] = start;
+#endif
         if (start != offset[j]) {
             DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
             if (status == NC_NOERR) status = err;
@@ -597,53 +622,26 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
     }
     /* k is the number of valid write requests */
 
     NCI_Free(noFill);
-
-    if (k == 0) {
-        filetype = MPI_BYTE;
-    }
-    else {
-        /* create fileview: a list of contiguous segment for each variable */
-#ifdef HAVE_MPI_LARGE_COUNT
-        mpireturn = MPI_Type_create_hindexed_c(k, blocklengths, offset,
-                                               MPI_BYTE, &filetype);
-#else
-        mpireturn = MPI_Type_create_hindexed(k, blocklengths, offset,
-                                             MPI_BYTE, &filetype);
-#endif
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_hindexed");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
-        }
-        else
-            MPI_Type_commit(&filetype);
-    }
-
-    NCI_Free(blocklengths);
     NCI_Free(count);
-    NCI_Free(offset);
 
-    /* when nprocs == 1, we keep I/O mode in independent mode at all time */
-    fh = ncp->collective_fh;
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, k, offset, blocklengths);
+    status = (status == NC_NOERR) ?
err : status;
 
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, filetype, "native",
-                                 MPI_INFO_NULL));
-    if (k > 0) MPI_Type_free(&filetype);
-
-    bufType = MPI_BYTE;
+    buf_view.type = MPI_BYTE;
     if (buf_len > NC_MAX_INT) {
 #ifdef HAVE_MPI_LARGE_COUNT
+        int mpireturn;
+
         mpireturn = MPI_Type_contiguous_c((MPI_Count)buf_len, MPI_BYTE,
-                                          &bufType);
+                                          &buf_view.type);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous_c");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
-            buf_len = 0;
+            buf_view.size = 0;
         }
         else {
-            MPI_Type_commit(&bufType);
-            buf_len = 1;
+            MPI_Type_commit(&buf_view.type);
         }
 #else
         if (status == NC_NOERR)
@@ -653,39 +651,38 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
 #endif
     }
 
-    /* write to variable collectively */
-    if (ncp->nprocs > 1) {
+    MPI_Offset off=0;
 #ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_all_c, (fh, 0, buf, (MPI_Count)buf_len,
-                                           bufType, &mpistatus));
+    MPI_Offset len=buf_len;
 #else
-        TRACE_IO(MPI_File_write_at_all, (fh, 0, buf, (int)buf_len,
-                                         bufType, &mpistatus));
+    int len=buf_len;
 #endif
-    }
-    else {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_c, (fh, 0, buf, (MPI_Count)buf_len,
-                                       bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at, (fh, 0, buf, (int)buf_len,
-                                     bufType, &mpistatus));
-#endif
-    }
+    /* write buffer is contiguous */
+    buf_view.size = buf_len;
+    buf_view.count = 1;
+    buf_view.off = &off;
+    buf_view.len = &len;
+    buf_view.is_contig = 1;
+
+    /* write to variable collectively */
+    if (nprocs > 1)
+        wlen = ncmpio_file_write_at_all(ncp, 0, buf, buf_view);
+    else
+        wlen = ncmpio_file_write_at(ncp, 0, buf, buf_view);
+    if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
     NCI_Free(buf);
-    if (bufType != MPI_BYTE) MPI_Type_free(&bufType);
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
+    if (buf_view.type != MPI_BYTE) MPI_Type_free(&buf_view.type);
+
+    if (blocklengths != NULL) NCI_Free(blocklengths);
+    if (offset != NULL) NCI_Free(offset);
+
+    /* reset fileview to make the entire file visible */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    status = (status == NC_NOERR) ? err : status;
 
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
-                                 MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
     return status;
 }
 
diff --git a/src/drivers/ncmpio/ncmpio_getput.m4 b/src/drivers/ncmpio/ncmpio_getput.m4
index 363701cac..9ceb80c24 100644
--- a/src/drivers/ncmpio/ncmpio_getput.m4
+++ b/src/drivers/ncmpio/ncmpio_getput.m4
@@ -44,20 +44,22 @@ dnl
 #include "ncmpio_subfile.h"
 #endif
 
+#define ALWAYS_USE_INA
+
 /* buffer layers:
 
    For write requests:
    buf   (user buffer of internal data type)
    lbuf  (contiguous buffer packed from buf based on buftype)
    cbuf  (contiguous buffer packed from lbuf based on imap)
-   xbuf  (contiguous buffer in external data type, type-casted/byte-swapped
+   xbuf  (contiguous buffer in external data type, type-cast/byte-swapped
          from cbuf, ready to be used in MPI_File_write to write to file)
 
    For read requests:
   xbuf  (contiguous buffer to be used in MPI_File_read to read from file.
Its contents are in external data type) - cbuf (contiguous buffer type-casted/byte-swapped from xbuf, its contents - are in internal data type) + cbuf (contiguous buffer type-cast/byte-swapped from xbuf, its contents are + in internal data type) lbuf (contiguous buffer unpacked from cbuf based on imap) buf (user buffer, unpacked from lbuf based on buftype) @@ -118,10 +120,18 @@ put_varm(NC *ncp, void *xbuf=NULL; int mpireturn, err=NC_NOERR, status=NC_NOERR, buftype_is_contig; int el_size, need_convert=0, need_swap=0, need_swap_back_buf=0; - int coll_indep, xtype_is_contig=1, can_swap_in_place; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, imaptype, filetype=MPI_BYTE; - MPI_File fh; + int can_swap_in_place; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to obtain the followings: * itype: element data type (MPI primitive type) in buftype @@ -135,20 +145,10 @@ put_varm(NC *ncp, * el_size: byte size of itype * buftype_is_contig: whether buftype is contiguous */ - if (varp == NULL) { /* zero-sized request */ - itype = MPI_BYTE; - el_size = 0; - bnelems = 0; - nbytes = 0; - buftype_is_contig = 0; - } - else { - err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); - if (err != NC_NOERR) goto err_check; - } - xtype_is_contig = buftype_is_contig; + err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); + if (err != NC_NOERR) goto err_check; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -174,10 +174,15 @@ put_varm(NC *ncp, goto err_check; /* check if type conversion and Endianness byte swap is needed */ - if (varp != NULL) { /* non-zero-sized request */ - need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); - need_swap = NEED_BYTE_SWAP(varp->xtype, itype); - } + need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); + need_swap = NEED_BYTE_SWAP(varp->xtype, itype); + + /* check whether this is a true varm call, if yes, imaptype will be a + * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL + */ + imaptype = MPI_DATATYPE_NULL; + err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); + if (err != NC_NOERR) goto err_check; /* check if in-place byte swap can be enabled */ can_swap_in_place = 1; @@ -190,25 +195,23 @@ put_varm(NC *ncp, else if (! fIsSet(ncp->flags, NC_MODE_SWAP_ON)) { /* auto mode, as user does not explicitly enable it */ if (nbytes <= NC_BYTE_SWAP_BUFFER_SIZE) - /* If write amount is small, disable in-place swap. - * This is because the user buffer may be immutable. In this - * case, in-place swap will cause segmentation fault. Immutable - * buffers are usually small. */ + /* If write amount is small, disable in-place swap. This is + * because the user buffer may be immutable. In this case, + * in-place swap will cause segmentation fault. Immutable + * buffers are usually small. 
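To make the in-place-swap trade-off described above concrete, here is a hedged sketch of the decision under stated assumptions: NC_SWAP_THRESHOLD, swap4b, and the scratch buffer are hypothetical stand-ins, not PnetCDF's actual helpers. Small requests are swapped into a copy because the user buffer may be read-only; large requests are swapped in place and must be swapped back after the write.

    #include <stdint.h>
    #include <string.h>

    #define NC_SWAP_THRESHOLD 4096      /* hypothetical threshold */

    static void swap4b(void *p, size_t n)   /* swap n 4-byte elements */
    {
        uint32_t *u = (uint32_t*) p;
        size_t i;
        for (i=0; i<n; i++)
            u[i] = (u[i] >> 24) | ((u[i] >> 8) & 0x0000ff00) |
                   ((u[i] << 8) & 0x00ff0000) | (u[i] << 24);
    }

    static void *prep_write_buf(void *ubuf, size_t nbytes, void *scratch)
    {
        if (nbytes <= NC_SWAP_THRESHOLD) {
            /* small request: user buffer may be immutable, swap a copy */
            memcpy(scratch, ubuf, nbytes);
            swap4b(scratch, nbytes / 4);
            return scratch;
        }
        swap4b(ubuf, nbytes / 4);   /* in place; caller swaps back later */
        return ubuf;
    }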
+ */ can_swap_in_place = 0; } } - /* check whether this is a true varm call, if yes, imaptype will be a - * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL - */ - imaptype = MPI_DATATYPE_NULL; - if (varp != NULL) { /* non-zero-sized request */ - err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); - if (err != NC_NOERR) goto err_check; - } - +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && buftype_is_contig && + (!need_swap || can_swap_in_place)) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || (can_swap_in_place && buftype_is_contig))) { + (!need_swap || (can_swap_in_place && buftype_is_contig))) +#endif + { /* reuse buftype, bufcount, buf in later MPI file write */ xbuf = buf; if (need_swap) { @@ -216,17 +219,17 @@ put_varm(NC *ncp, need_swap_back_buf = 1; } } - else if (varp != NULL) { + else { xbuf = NCI_Malloc((size_t)nbytes); if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } need_swap_back_buf = 0; - xtype_is_contig = 1; - /* pack buf to xbuf, byte-swap and type-convert on xbuf, which - * will later be used in MPI file write */ + /* Pack buf to xbuf, byte-swap and type-convert on xbuf, which will + * later be used in MPI file write. + */ err = ncmpio_pack_xbuf(ncp->format, varp, bufcount, buftype, buftype_is_contig, bnelems, itype, el_size, imaptype, need_convert, need_swap, nbytes, buf, @@ -238,16 +241,14 @@ put_varm(NC *ncp, } } - /* Set nelems and xtype which will be used in MPI read/write */ - if (buf != xbuf && varp != NULL) { + /* Set nelems which will be used in MPI read/write */ + if (buf != xbuf) { /* xbuf is a contiguous buffer */ - xtype = ncmpii_nc2mpitype(varp->xtype); nelems = bnelems; } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? bnelems : bufcount; - xtype = buftype; } err_check: @@ -263,12 +264,22 @@ err_check: */ nbytes = 0; nelems = 0; - filetype = MPI_BYTE; - xtype = MPI_BYTE; } - if (fIsSet(reqMode, NC_REQ_COLL) && ncp->my_aggr >= 0 && ncp->nprocs > 1) { - /* intra-node write aggregation must be in collective mode */ +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_WR, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ void *wbuf = (nbytes == 0) ? NULL : xbuf; err = ncmpio_intra_node_aggregation(ncp, NC_REQ_WR, varp, start, count, stride, nelems, xtype, wbuf); @@ -297,15 +308,8 @@ err_check: * at a time. */ - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; @@ -316,10 +320,10 @@ err_check: * written to the variable defined in file. Note data stored in xbuf * is in the external data type, ready to be written to file. 
*/ - err = ncmpio_read_write(ncp, NC_REQ_WR, coll_indep, offset, nelems, - xtype, xbuf, xtype_is_contig); + err = ncmpio_read_write(ncp, NC_REQ_WR, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } +#endif /* done with xbuf */ if (xbuf != NULL && xbuf != buf) NCI_Free(xbuf); @@ -340,7 +344,8 @@ err_check: new_numrecs = start[0] + (count[0] - 1) * stride[0] + 1; /* note new_numrecs can be smaller than ncp->numrecs when this - * write request writes existing records */ + * write request writes existing records + */ } if (fIsSet(reqMode, NC_REQ_COLL)) { @@ -357,8 +362,9 @@ err_check: if (status == NC_NOERR) status = err; } } - /* In collective mode, ncp->numrecs is always sync-ed among - processes */ + /* In collective data mode, ncp->numrecs is always sync-ed among + * processes + */ if (ncp->numrecs < max_numrecs) { err = ncmpio_write_numrecs(ncp, max_numrecs); if (status == NC_NOERR) status = err; @@ -396,11 +402,19 @@ get_varm(NC *ncp, int reqMode) /* WR/RD/COLL/INDEP */ { void *xbuf=NULL; - int err=NC_NOERR, status=NC_NOERR, coll_indep, xtype_is_contig=1; + int err=NC_NOERR, status=NC_NOERR; int el_size, buftype_is_contig, need_swap=0, need_convert=0; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, filetype=MPI_BYTE, imaptype=MPI_DATATYPE_NULL; - MPI_File fh; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype=MPI_DATATYPE_NULL; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to see if we can use buf to read from file. * itype: element data type (MPI primitive type) in buftype @@ -415,10 +429,9 @@ get_varm(NC *ncp, * buftype_is_contig: whether buftype is contiguous */ err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); if (err != NC_NOERR) goto err_check; - xtype_is_contig = buftype_is_contig; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -461,32 +474,36 @@ get_varm(NC *ncp, * For condition 1, buftype is decoded in ncmpii_buftype_decode() * For condition 2, imap is checked in ncmpii_create_imaptype() */ +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && + !need_swap && buftype_is_contig) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || buftype_is_contig)) { + (!need_swap || buftype_is_contig)) +#endif + { /* reuse buftype, bufcount, buf in later MPI file read */ xbuf = buf; } else { /* allocate xbuf for reading */ xbuf = NCI_Malloc((size_t)nbytes); - xtype_is_contig = 1; if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } } /* Note xbuf is the buffer to be used in MPI read calls, and hence its - * contents are in the external type */ + * contents are in the external type. + */ - /* Set nelems and xtype which will be used in MPI read/write */ + /* Set nelems which will be used in MPI read/write */ if (buf != xbuf) { /* xbuf is a contiguous buffer */ nelems = bnelems; - xtype = ncmpii_nc2mpitype(varp->xtype); } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? 
bnelems : bufcount; - xtype = buftype; } err_check: @@ -496,58 +513,71 @@ err_check: /* for independent API, this process returns now */ if (fIsSet(reqMode, NC_REQ_INDEP)) return err; - /* for collective API, this process needs to participate the - * collective I/O operations, but with zero-length request + /* for collective API, this process needs to participate the collective + * I/O operations, but with zero-length request */ - filetype = MPI_BYTE; - xtype = MPI_BYTE; nbytes = 0; nelems = 0; } + +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_RD, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ + void *rbuf = (nbytes == 0) ? NULL : xbuf; + err = ncmpio_intra_node_aggregation(ncp, NC_REQ_RD, varp, start, count, + stride, nelems, xtype, rbuf); + if (status == NC_NOERR) status = err; + } else { - /* Create the filetype for this request and calculate the beginning - * file offset for this request. If this request is contiguous in file, - * then set filetype == MPI_BYTE. Otherwise filetype will be an MPI - * derived data type. + if (nbytes > 0) { + /* Create the filetype for this request and calculate the beginning + * file offset for this request. If this request is contiguous in + * file, then set filetype == MPI_BYTE. Otherwise filetype will be + * an MPI derived data type. + */ + err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, + &offset, &filetype, NULL); + if (err != NC_NOERR) { + filetype = MPI_BYTE; + xtype = MPI_BYTE; + nbytes = 0; + nelems = 0; + if (status == NC_NOERR) status = err; + } + } + + /* TODO: if record variables are too big (so big that we cannot store + * the stride between records in an MPI_Aint, for example) then we will + * have to process this one record at a time. */ - err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, - &offset, &filetype, NULL); + + /* MPI_File_set_view is collective */ + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { - filetype = MPI_BYTE; - xtype = MPI_BYTE; - nbytes = 0; - nelems = 0; + nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; } - } - - /* TODO: if record variables are too big (so big that we cannot store the - * stride between records in an MPI_Aint, for example) then we will - * have to process this one record at a time. - */ - - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } + if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); - if (err != NC_NOERR) { - nelems = 0; /* skip this request */ + /* xtype is the element data type (MPI primitive type) in xbuf to be + * read from the variable defined in file. Note xbuf will contain data + * read from the file and hence is in the external data type. 
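The zero-length branch above reflects a general MPI rule: in collective data mode every process must enter the collective call, even when it has nothing to transfer, or the program deadlocks. A minimal generic sketch, assuming an MPI 4.0 library for the large-count variant:

    #include <mpi.h>

    static int collective_read(MPI_File fh, MPI_Offset offset,
                               void *buf, MPI_Count nbytes)
    {
        MPI_Status st;
        /* ranks with no data pass nbytes == 0 but still make the call */
        return MPI_File_read_at_all_c(fh, offset, buf, nbytes,
                                      MPI_BYTE, &st);
    }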
+ */ + err = ncmpio_read_write(ncp, NC_REQ_RD, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } - if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - - /* xtype is the element data type (MPI primitive type) in xbuf to be - * read from the variable defined in file. Note xbuf will contain data read - * from the file and hence is in the external data type. - */ - err = ncmpio_read_write(ncp, NC_REQ_RD, coll_indep, offset, nelems, xtype, - xbuf, xtype_is_contig); - if (status == NC_NOERR) status = err; +#endif if (nelems > 0) { /* unpack xbuf into user buffer, buf */ @@ -608,15 +638,22 @@ ncmpio_$1_var(void *ncdp, * write, they still need to participate the communication part of the * intra-node aggregation operation. */ - ifelse(`$1',`put',`if (ncp->my_aggr >= 0) - return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, buftype, reqMode);') +#ifdef ALWAYS_USE_INA + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); +#else + if (ncp->num_aggrs_per_node > 0) + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); /* this collective API has a zero-length request */ return ncmpio_getput_zero_req(ncp, reqMode); +#endif } /* obtain NC_var object pointer, varp. Note sanity check for ncdp and - * varid has been done in dispatchers */ + * varid has been done in dispatchers + */ varp = ncp->vars.value[varid]; #ifdef ENABLE_SUBFILING diff --git a/src/drivers/ncmpio/ncmpio_header_get.c b/src/drivers/ncmpio/ncmpio_header_get.c index 6ddd89bc1..b1ae28cc4 100644 --- a/src/drivers/ncmpio/ncmpio_header_get.c +++ b/src/drivers/ncmpio/ncmpio_header_get.c @@ -316,103 +316,101 @@ hdr_len_NC_vararray(const NC_vararray *ncap, /*----< hdr_fetch() >--------------------------------------------------------*/ /* Fetch the next header chunk. The chunk buffer, pointed by gbp->base, is of - * size 'gbp->chunk' bytes. Be careful not to overwrite leftover (yet to be - * used) data in the buffer before fetching a new chunk. + * size 'gbp->ncp->chunk' bytes. Be careful not to overwrite leftover (yet to + * be used) data in the buffer before fetching a new chunk. */ static int hdr_fetch(bufferinfo *gbp) { - char *mpi_name; int rank, nprocs, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; + PNCIO_View buf_view; assert(gbp->base != NULL); - MPI_Comm_size(gbp->comm, &nprocs); - MPI_Comm_rank(gbp->comm, &rank); + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + buf_view.is_contig = 1; + buf_view.type = MPI_BYTE; + + MPI_Comm_size(gbp->ncp->comm, &nprocs); + MPI_Comm_rank(gbp->ncp->comm, &rank); if (rank == 0) { char *readBuf; int readLen; size_t slack; + MPI_Offset rlen; /* any leftover data in the buffer */ - slack = gbp->chunk - (gbp->pos - gbp->base); - if (slack == gbp->chunk) slack = 0; + slack = gbp->ncp->chunk - (gbp->pos - gbp->base); + if (slack == gbp->ncp->chunk) slack = 0; - /* When gbp->chunk == (gbp->pos - gbp->base), all data in the buffer has - * been consumed. If not, then read additional header of size - * (gbp->chunk - slack) into a contiguous buffer, pointed by gbp->base + - * slack. + /* When gbp->ncp->chunk == (gbp->pos - gbp->base), all data in the + * buffer has been consumed. If not, then read additional header of + * size (gbp->ncp->chunk - slack) into a contiguous buffer, pointed by + * gbp->base + slack. 
*/
         readBuf = gbp->base;
-        readLen = gbp->chunk;
+        readLen = gbp->ncp->chunk;
         if (slack > 0) {
             /* move slack to beginning of the buffer, gbp->base */
             memmove(gbp->base, gbp->pos, slack);
             readBuf += slack;
             readLen -= slack;
         }
 
-        /* explicitly initialize mpistatus object to 0. For zero-length read,
-         * MPI_Get_count may report incorrect result for some MPICH version,
-         * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-         */
-        memset(&mpistatus, 0, sizeof(MPI_Status));
+        buf_view.size = readLen;
 
         /* fileview is already entire file visible and MPI_File_read_at does
            not change the file pointer */
-        if (gbp->coll_mode == 1) { /* collective read */
-            TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, gbp->offset, readBuf,
-                                            readLen, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_read_at, (gbp->collective_fh, gbp->offset, readBuf,
-                                        readLen, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD)
-        }
-        else {
-            /* Obtain the actual read amount. It may be smaller than readLen,
-             * when the remaining file size is smaller than read chunk size.
-             * Because each MPI File_read reads amount of readLen bytes, and
-             * readLen <= read chunk size which is <= NC_MAX_INT, calling
-             * MPI_Get_count() is sufficient. No need to call MPI_Get_count_c()
-             */
-            int get_size;
-            MPI_Get_count(&mpistatus, MPI_BYTE, &get_size);
-            gbp->get_size += get_size;
-
-            /* If actual read amount is shorter than readLen, then we zero-out
-             * the remaining buffer. This is because the MPI_Bcast below
-             * broadcasts a buffer of a fixed size, gbp->chunk. Without zeroing
-             * out, valgrind will complain about the uninitialized values.
+        if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL))
+            /* collective read */
+            rlen = ncmpio_file_read_at_all(gbp->ncp, gbp->offset, readBuf,
+                                           buf_view);
+        else
+            /* independent read */
+            rlen = ncmpio_file_read_at(gbp->ncp, gbp->offset, readBuf,
+                                       buf_view);
+
+        if (rlen > 0) {
+            /* rlen is the actual read amount, which can be smaller than
+             * readLen when the remaining file size is smaller than readLen.
+             * In that case, zero-out the rest of the buffer. This is because
+             * the MPI_Bcast below broadcasts a buffer of a fixed size,
+             * gbp->ncp->chunk. Without zeroing out, valgrind will complain
+             * about the uninitialized values.
              */
-            if (get_size < readLen)
-                memset(readBuf + get_size, 0, readLen - get_size);
+            if (rlen < readLen)
+                memset(readBuf + rlen, 0, readLen - rlen);
         }
+        else if (rlen < 0)
+            err = (int)rlen;
+
         /* only root process reads file header, keeps track of current read
          * file pointer location */
-        gbp->offset += readLen;
+        gbp->offset += rlen;
     }
-    else if (gbp->coll_mode == 1) { /* collective read */
-        /* other processes participate the collective call */
-        TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, 0, NULL,
-                                        0, MPI_BYTE, &mpistatus));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD)
-        }
+    else if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL)) {
+        /* Collective read: non-root ranks participate in the collective call
+         * with a zero-sized request.
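The read-zero-fill-broadcast sequence above can be summarized by the following sketch (generic names, error handling trimmed; not the PnetCDF routine itself). Rank 0 reads one fixed-size chunk, zero-fills whatever a short read near end-of-file could not supply, and broadcasts the full chunk so every rank parses identical bytes:

    #include <string.h>
    #include <mpi.h>

    static int fetch_chunk(MPI_File fh, MPI_Offset off, char *chunk,
                           int chunk_size, int rank, MPI_Comm comm)
    {
        if (rank == 0) {
            MPI_Status st;
            int got = 0;
            MPI_File_read_at(fh, off, chunk, chunk_size, MPI_BYTE, &st);
            MPI_Get_count(&st, MPI_BYTE, &got);
            if (got < chunk_size)              /* short read near EOF */
                memset(chunk + got, 0, chunk_size - got);
        }
        /* the fixed-size broadcast is why the zero-fill above matters */
        return MPI_Bcast(chunk, chunk_size, MPI_BYTE, 0, comm);
    }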
+ */ + buf_view.size = 0; + ncmpio_file_read_at_all(gbp->ncp, 0, NULL, buf_view); } - if (gbp->safe_mode == 1 && nprocs > 1) { - TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->comm); + if (gbp->ncp->safe_mode == 1 && nprocs > 1) { + TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); if (err != NC_NOERR) return err; } /* broadcast root's read (full or partial header) to other processes */ - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(gbp->base, gbp->chunk, MPI_BYTE, 0, gbp->comm); + if (nprocs > 1) { + TRACE_COMM(MPI_Bcast)(gbp->base, gbp->ncp->chunk, MPI_BYTE, 0, + gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); + } gbp->pos = gbp->base; @@ -503,7 +501,7 @@ hdr_get_nc_type(bufferinfo *gbp, nc_type *xtypep) if (xtype < NC_BYTE) DEBUG_RETURN_ERROR(NC_EBADTYPE) - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { if (xtype > NC_DOUBLE) DEBUG_RETURN_ERROR(NC_EBADTYPE) } @@ -536,7 +534,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) *namep = NULL; /* get nelems (string length of name) */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -564,7 +562,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) */ padding = PNETCDF_RNDUP(nchars, X_ALIGN) - nchars; - bufremain = gbp->chunk - (gbp->pos - gbp->base); + bufremain = gbp->ncp->chunk - (gbp->pos - gbp->base); cpos = *namep; @@ -585,7 +583,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) *namep = NULL; return err; } - bufremain = gbp->chunk; + bufremain = gbp->ncp->chunk; } } @@ -659,7 +657,7 @@ hdr_get_NC_dim(bufferinfo *gbp, int unlimited_id, NC_dim **dimpp) else if (err != NC_NOERR) return err; /* get dim_length */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); dim_length = (MPI_Offset)tmp; @@ -730,7 +728,7 @@ hdr_get_NC_dimarray(bufferinfo *gbp, NC_dimarray *ncap) if (err != NC_NOERR) return err; /* read nelems (number of dimensions) from gbp buffer */ - if (gbp->version < 5) { /* nelems is */ + if (gbp->ncp->format < 5) { /* nelems is */ uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -809,8 +807,8 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp) nbytes = attrp->nelems * xsz; padding = attrp->xsz - nbytes; - bufremain = gbp->chunk - (gbp->pos - gbp->base); - /* gbp->chunk is the read chunk size, which is of type 4-byte int. + bufremain = gbp->ncp->chunk - (gbp->pos - gbp->base); + /* gbp->ncp->chunk is the read chunk size, which is of type 4-byte int. 
* thus bufremain should be less than INT_MAX */
 
     /* get values */
@@ -823,10 +821,9 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp)
             value = (void *)((char *)value + attcount);
             bufremain -= attcount;
         } else {
-            int err;
             err = hdr_fetch(gbp);
             if (err != NC_NOERR) return err;
-            bufremain = gbp->chunk;
+            bufremain = gbp->ncp->chunk;
         }
     }
 
@@ -906,7 +903,7 @@ hdr_get_NC_attr(bufferinfo *gbp, NC_attr **attrpp)
     }
 
     /* get nelems */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         nelems = (MPI_Offset)tmp;
@@ -977,7 +974,7 @@ hdr_get_NC_attrarray(bufferinfo *gbp, NC_attrarray *ncap)
     if (err != NC_NOERR) return err;
 
     /* read nelems (number of attributes) from gbp buffer */
-    if (gbp->version < 5) { /* nelems is */
+    if (gbp->ncp->format < 5) { /* nelems is */
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) return err;
@@ -1061,7 +1058,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     else if (err != NC_NOERR) return err;
 
     /* nelems (number of dimensions) */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) {
@@ -1099,7 +1096,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     /* get [dimid ...] */
     for (dim=0; dim<varp->ndims; dim++) {
-        if (gbp->version < 5) {
+        if (gbp->ncp->format < 5) {
             uint tmp;
             err = hdr_get_uint32(gbp, &tmp);
             if (err != NC_NOERR) break;
@@ -1135,7 +1132,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     ncmpii_xlen_nc_type(varp->xtype, &varp->xsz);
 
     /* get vsize */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         varp->len = (MPI_Offset)tmp;
@@ -1164,7 +1161,7 @@
      */
 
     /* get begin */
-    if (gbp->version == 1) {
+    if (gbp->ncp->format == 1) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         varp->begin = (MPI_Offset)tmp;
@@ -1223,7 +1220,7 @@ hdr_get_NC_vararray(bufferinfo *gbp,
     if (err != NC_NOERR) return err;
 
     /* read nelems (number of variables) from gbp buffer */
-    if (gbp->version < 5) { /* nelems is */
+    if (gbp->ncp->format < 5) { /* nelems is */
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) return err;
@@ -1339,24 +1336,13 @@ ncmpio_hdr_get_NC(NC *ncp)
     assert(ncp != NULL);
 
     /* Initialize the get buffer that stores the header read from the file */
-    getbuf.comm          = ncp->comm;
-    getbuf.collective_fh = ncp->collective_fh;
-    getbuf.get_size      = 0;
-    getbuf.offset        = 0;   /* read from start of the file */
-    getbuf.safe_mode     = ncp->safe_mode;
-    if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL))
-        getbuf.coll_mode = 1;
-    else
-        getbuf.coll_mode = 0;
-
-    /* CDF-5's minimum header size is 4 bytes more than CDF-1 and CDF-2's */
-    getbuf.chunk = PNETCDF_RNDUP( MAX(MIN_NC_XSZ+4, ncp->chunk), X_ALIGN );
+    getbuf.ncp    = ncp;
+    getbuf.offset = 0;   /* read from start of the file */
+    getbuf.base   = (char*) NCI_Malloc(getbuf.ncp->chunk);
+    getbuf.pos    = getbuf.base;
+    getbuf.end    = getbuf.base + getbuf.ncp->chunk;
 
-    getbuf.base = (char*) NCI_Malloc(getbuf.chunk);
-    getbuf.pos  = getbuf.base;
-    getbuf.end  = getbuf.base + getbuf.chunk;
-
-    /* Fetch the next header chunk. The chunk is 'gbp->chunk' bytes big */
+    /* Fetch the next header chunk.
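The hdr_get_* routines above all share one consumption pattern: take bytes from the in-memory chunk and refill it whenever it runs dry. A sketch under stated assumptions, where refill_chunk is a hypothetical stand-in for hdr_fetch:

    #include <string.h>

    extern int refill_chunk(char *base);  /* hypothetical hdr_fetch stand-in */

    static int get_bytes(char **pos, char *base, size_t chunk_size,
                         size_t *bufremain, char *dst, size_t nbytes)
    {
        while (nbytes > 0) {
            if (*bufremain == 0) {         /* chunk exhausted: refill it */
                if (refill_chunk(base) != 0)
                    return -1;
                *pos = base;
                *bufremain = chunk_size;
            }
            size_t take = (nbytes < *bufremain) ? nbytes : *bufremain;
            memcpy(dst, *pos, take);
            dst += take;  *pos += take;
            *bufremain -= take;  nbytes -= take;
        }
        return 0;
    }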
The chunk is 'gbp->ncp->chunk' bytes big */ err = hdr_fetch(&getbuf); if (err != NC_NOERR) return err; @@ -1381,20 +1367,20 @@ ncmpio_hdr_get_NC(NC *ncp) goto fn_exit; } - /* check version number in last byte of magic */ - if (magic[3] == 0x1) { - getbuf.version = ncp->format = 1; - } else if (magic[3] == 0x2) { - getbuf.version = ncp->format = 2; - } else if (magic[3] == 0x5) { - getbuf.version = ncp->format = 5; - } else { + /* check format version number in last byte of magic */ + if (magic[3] == 0x1) + ncp->format = 1; + else if (magic[3] == 0x2) + ncp->format = 2; + else if (magic[3] == 0x5) + ncp->format = 5; + else { NCI_Free(getbuf.base); DEBUG_RETURN_ERROR(NC_ENOTNC) /* not a netCDF file */ } /* get numrecs from getbuf into ncp */ - if (getbuf.version < 5) { + if (getbuf.ncp->format < 5) { uint tmp=0; err = hdr_get_uint32(&getbuf, &tmp); if (err != NC_NOERR) goto fn_exit; @@ -1449,7 +1435,6 @@ ncmpio_hdr_get_NC(NC *ncp) if (err != NC_NOERR) goto fn_exit; fn_exit: - ncp->get_size += getbuf.get_size; NCI_Free(getbuf.base); return (err == NC_NOERR) ? status : err; diff --git a/src/drivers/ncmpio/ncmpio_header_put.c b/src/drivers/ncmpio/ncmpio_header_put.c index 8daf88c67..387e9cfba 100644 --- a/src/drivers/ncmpio/ncmpio_header_put.c +++ b/src/drivers/ncmpio/ncmpio_header_put.c @@ -49,7 +49,7 @@ hdr_put_NC_name(bufferinfo *pbp, size_t nchars = strlen(name); /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) err = ncmpix_put_uint32((void**)(&pbp->pos), (uint)nchars); else err = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)nchars); @@ -78,7 +78,7 @@ hdr_put_NC_dim(bufferinfo *pbp, if (err != NC_NOERR) return err; /* copy dim_length */ - if (pbp->version < 5) { + if (pbp->ncp->format < 5) { /* TODO: Isn't checking dimension size already done in def_dim()? 
*/
         if (dimp->size > NC_MAX_INT)
             DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
@@ -116,7 +116,7 @@ hdr_put_NC_dimarray(bufferinfo *pbp,
         if (status != NC_NOERR) return status;
 
         /* put a ZERO or ZERO64 depending on which CDF format */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), 0);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), 0);
@@ -128,7 +128,7 @@
         if (status != NC_NOERR) return status;
 
         /* copy nelems */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined);
@@ -175,7 +175,7 @@ hdr_put_NC_attrV(bufferinfo *pbp,
     sz = attrp->nelems * xsz;
     padding = attrp->xsz - sz;
 
-    if (pbp->version < 5 && sz > NC_MAX_INT)
+    if (pbp->ncp->format < 5 && sz > NC_MAX_INT)
         DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
 
     memcpy(pbp->pos, attrp->xvalue, (size_t)sz);
@@ -214,7 +214,7 @@ hdr_put_NC_attr(bufferinfo *pbp,
     if (status != NC_NOERR) return status;
 
     /* copy nelems */
-    if (pbp->version < 5) {
+    if (pbp->ncp->format < 5) {
         if (attrp->nelems > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
         status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)attrp->nelems);
@@ -258,7 +258,7 @@ hdr_put_NC_attrarray(bufferinfo *pbp,
         if (status != NC_NOERR) return status;
 
         /* put a ZERO or ZERO64 depending on which CDF format */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), 0);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), 0);
@@ -270,7 +270,7 @@
         if (status != NC_NOERR) return status;
 
         /* copy nelems */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined);
@@ -314,7 +314,7 @@ hdr_put_NC_var(bufferinfo *pbp,
     if (status != NC_NOERR) return status;
 
     /* copy nelems */
-    if (pbp->version < 5)
+    if (pbp->ncp->format < 5)
         status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->ndims);
     else
         status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->ndims);
@@ -322,7 +322,7 @@
 
     /* copy [dimid ...] */
     for (i=0; i<varp->ndims; i++) {
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->dimids[i]);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->dimids[i]);
@@ -341,7 +341,7 @@
     /* in CDF-1 and CDF-2, a variable's size in the header is a 32-bit integer
      * in CDF-5, it is a 64-bit integer
      */
-    if (pbp->version < 5) {
+    if (pbp->ncp->format < 5) {
         /* Special case, when there is no record variable, the last fixed-size
          * variable can be larger than 2 GiB if its file starting offset is
         * less than 2 GiB.
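Every put/get pair in these header routines dispatches on the CDF format version the same way: CDF-1/2 (format < 5) store counts and sizes as 32-bit values, CDF-5 as 64-bit, so each put must range-check before narrowing. A sketch with hypothetical helpers, where put_u32/put_u64 stand in for ncmpix_put_uint32/ncmpix_put_uint64:

    #include <stdint.h>

    extern int put_u32(void **pos, uint32_t v);  /* hypothetical helpers */
    extern int put_u64(void **pos, uint64_t v);

    static int put_header_count(void **pos, int format, long long val)
    {
        if (format < 5) {
            if (val > 2147483647LL)  /* NC_MAX_INT: value cannot narrow */
                return -1;           /* caller maps this to NC_EINTOVERFLOW */
            return put_u32(pos, (uint32_t)val);
        }
        return put_u64(pos, (uint64_t)val);
    }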
This checking has already been done in the call @@ -367,7 +367,7 @@ hdr_put_NC_var(bufferinfo *pbp, /* in CDF-1 header, a variable's starting file offset is a 32-bit integer * in CDF-2 and CDF-5, it is a 64-bit integer */ - if (pbp->version == 1) { + if (pbp->ncp->format == 1) { if (varp->begin > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->begin); @@ -407,7 +407,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* put a ZERO or ZERO64 depending on which CDF format */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), 0); else status = ncmpix_put_uint64((void**)(&pbp->pos), 0); @@ -419,7 +419,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined); @@ -441,20 +441,14 @@ hdr_put_NC_vararray(bufferinfo *pbp, int ncmpio_hdr_put_NC(NC *ncp, void *buf) { - int status; + int err; bufferinfo putbuf; MPI_Offset nrecs=0; - putbuf.comm = ncp->comm; - putbuf.collective_fh = ncp->collective_fh; - putbuf.offset = 0; - putbuf.pos = buf; - putbuf.base = buf; - putbuf.safe_mode = ncp->safe_mode; - if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) - putbuf.coll_mode = 1; - else - putbuf.coll_mode = 0; + putbuf.ncp = ncp; + putbuf.offset = 0; + putbuf.pos = buf; + putbuf.base = buf; /* netCDF file format: * netcdf_file = header data @@ -462,43 +456,37 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ /* copy "magic", 4 characters */ - if (ncp->format == 5) { - putbuf.version = 5; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); - } - else if (ncp->format == 2) { - putbuf.version = 2; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); - } - else { - putbuf.version = 1; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); - } - if (status != NC_NOERR) return status; + if (ncp->format == 5) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); + else if (ncp->format == 2) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); + else + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); + if (err != NC_NOERR) return err; /* copy numrecs, number of records */ nrecs = ncp->numrecs; if (ncp->format < 5) { if (nrecs > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) - status = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); + err = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); } else { - status = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); + err = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); } - if (status != NC_NOERR) return status; + if (err != NC_NOERR) return err; /* copy dim_list */ - status = hdr_put_NC_dimarray(&putbuf, &ncp->dims); - if (status != NC_NOERR) return status; + err = hdr_put_NC_dimarray(&putbuf, &ncp->dims); + if (err != NC_NOERR) return err; /* copy gatt_list */ - status = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); - if (status != NC_NOERR) return status; + err = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); + if (err != NC_NOERR) return err; /* copy var_list */ - status = hdr_put_NC_vararray(&putbuf, &ncp->vars); - if (status != NC_NOERR) return status; + err = hdr_put_NC_vararray(&putbuf, &ncp->vars); + if (err != 
NC_NOERR) return err; return NC_NOERR; } @@ -514,11 +502,12 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ int ncmpio_write_header(NC *ncp) { - char *mpi_name; - int status=NC_NOERR, mpireturn, err; + int status=NC_NOERR, mpireturn; size_t i, ntimes; - MPI_File fh; - MPI_Status mpistatus; + PNCIO_View buf_view; + + buf_view.count = 1; + buf_view.is_contig = 1; /* Write the entire header to the file. This function may be called from * a rename API. In that case, we cannot just change the variable name in @@ -526,10 +515,6 @@ int ncmpio_write_header(NC *ncp) * all metadata following the new name must be moved ahead. */ - fh = ncp->collective_fh; - if (NC_indep(ncp)) /* called in independent data mode */ - fh = ncp->independent_fh; - /* update file header size, as this subroutine may be called from a rename * API (var or attribute) and the new name is smaller/bigger which changes * the header size. We recalculate ncp->xsz by getting the un-aligned size @@ -555,42 +540,17 @@ int ncmpio_write_header(NC *ncp) buf_ptr = buf; for (i=0; i<ntimes; i++) { + if (fIsSet(ncp->flags, NC_HCOLL)) /* header collective write */ + wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view); + else /* header independent write */ + wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; - /* explicitly initialize mpistatus object to 0. For zero-length - * read, MPI_Get_count may report incorrect result for some MPICH - * version, due to the uninitialized MPI_Status object passed to - * MPI-IO calls. Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - - if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - TRACE_IO(MPI_File_write_at_all, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - else { /* header independent write */ - TRACE_IO(MPI_File_write_at, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - else { - /* update the number of bytes written since file open. - * Because each MPI write writes no more than NC_MAX_INT, - * calling MPI_Get_count() is sufficient.
No need to call - * MPI_Get_count_c() - */ - int put_size; - mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += ncp->xsz; - else - ncp->put_size += writeLen; - } offset += writeLen; buf_ptr += writeLen; remain -= writeLen; @@ -598,10 +558,9 @@ int ncmpio_write_header(NC *ncp) NCI_Free(buf); } else if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - /* collective write: other processes participate the collective call */ - for (i=0; i<ntimes; i++) if (ncp->safe_mode == 1) { diff --git a/src/drivers/ncmpio/ncmpio_i_getput.m4 b/src/drivers/ncmpio/ncmpio_i_getput.m4 index 7f624207d..b5b4f09db 100644 --- a/src/drivers/ncmpio/ncmpio_i_getput.m4 +++ b/src/drivers/ncmpio/ncmpio_i_getput.m4 @@ -122,6 +122,11 @@ ncmpio_add_record_requests(NC_lead_req *lead_list, reqs[i].lead_off = reqs[0].lead_off; reqs[i].xbuf = xbuf; xbuf += rec_bufsize; + + /* copy the number of flattened offset-length pairs */ + reqs[i].npairs = reqs[0].npairs; + reqs[i].offset_start = reqs[0].offset_start; + reqs[i].offset_end = reqs[0].offset_end; } return NC_NOERR; @@ -142,7 +147,7 @@ ncmpio_igetput_varm(NC *ncp, int reqMode) { void *xbuf=NULL; - int i, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; + int i, j, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; int mpireturn, buftype_is_contig=1, need_convert, free_xbuf=0; int need_swap, can_swap_in_place, need_swap_back_buf=0; MPI_Offset nelems=0, nbytes, *ptr; @@ -520,9 +525,13 @@ ncmpio_igetput_varm(NC *ncp, } /* allocate a single array for non-leads to store start/count/stride */ + req->npairs = 0; if (varp->ndims == 0) { /* scalar variable, start may be NULL */ lead_req->start = NULL; req->start = NULL; + req->npairs = 1; + req->offset_start = 0; /* relative to var's begin */ + req->offset_end = varp->xsz; } else if (stride == NULL) { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -536,6 +545,12 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, start, memChunk); ptr += varp->ndims; memcpy(ptr, count, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + j = IS_RECVAR(varp) ? 1 : 0; + for (i=j; i<varp->ndims-1; i++) + req->npairs *= count[i]; } else { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -551,12 +566,24 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, count, memChunk); ptr += varp->ndims; memcpy(ptr, stride, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = (stride[varp->ndims-1] == 1) ? 1 : count[varp->ndims-1]; + j = IS_RECVAR(varp) ?
1 : 0; + for (i=j; i<varp->ndims-1; i++) + req->npairs *= count[i]; } /* set the properties of non-lead request */ req->xbuf = xbuf; req->nelems = nelems; + /* special treatment when there is only one offset-length pair */ + if (req->npairs == 1 && varp->ndims > 0) { + ncmpio_calc_off(ncp, varp, start, &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + if (IS_RECVAR(varp)) { /* save the last record number accessed */ if (stride == NULL) @@ -576,6 +603,8 @@ ncmpio_igetput_varm(NC *ncp, : ncp->get_lead_list; req->nelems /= count[0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; /* add (count[0]-1) number of (sub)requests */ ncmpio_add_record_requests(lead_list, req, count[0], stride); diff --git a/src/drivers/ncmpio/ncmpio_i_varn.m4 b/src/drivers/ncmpio/ncmpio_i_varn.m4 index be9af9752..8bad268f0 100644 --- a/src/drivers/ncmpio/ncmpio_i_varn.m4 +++ b/src/drivers/ncmpio/ncmpio_i_varn.m4 @@ -452,6 +452,12 @@ igetput_varn(NC *ncp, lead_req->max_rec = -1; lead_req->nonlead_num = new_nreqs; +#if 0 +MPI_Aint addr; +MPI_Get_address(lead_req->xbuf, &addr); +printf("%s at %d: lead_req xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr,lead_req->nelems); +#endif + /* varn APIs have no argument stride */ fSet(lead_req->flag, NC_REQ_STRIDE_NULL); @@ -466,6 +472,8 @@ igetput_varn(NC *ncp, xbufp = (char*)xbuf; for (i=0; i<num; i++) { + req->npairs = 0; + if (req_nelems[i] == 0) continue; /* ignore this 0-length request i */ req->nelems = req_nelems[i]; @@ -473,11 +481,17 @@ igetput_varn(NC *ncp, req->xbuf = xbufp; xbufp += req_nelems[i] * xsize; +#if 0 +MPI_Get_address(req->xbuf, &addr); +printf("%s at %d: req i=%d xbuf=%ld off=%ld nelems=%lld\n",__func__,__LINE__, i,addr,(char*)req->xbuf - (char*)xbuf,req->nelems); +#endif + /* copy starts[i] and counts[i] over to req */ req->start = start_ptr; memcpy(start_ptr, starts[i], memChunk); start_ptr += varp->ndims; /* count[] */ if (counts == NULL || counts[i] == NULL) { + /* counts == NULL, equivalent to all 1s */ for (j=0; j<varp->ndims; j++) start_ptr[j] = 1; /* start_ptr is now counts[] */ } @@ -492,6 +506,24 @@ igetput_varn(NC *ncp, if (counts == NULL || counts[i] == NULL) num_rec = 1; else num_rec = counts[i][0]; + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=1; j<varp->ndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } + max_rec = starts[i][0] + num_rec; lead_req->max_rec = MAX(lead_req->max_rec, max_rec); @@ -506,6 +538,11 @@ igetput_varn(NC *ncp, lead_list = (fIsSet(reqMode, NC_REQ_WR)) ?
ncp->put_lead_list : ncp->get_lead_list; + + req->nelems /= counts[i][0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; + /* append (counts[i][0]-1) number of requests to the queue */ ncmpio_add_record_requests(lead_list, req, counts[i][0], NULL); start_ptr += (counts[i][0] - 1) * 2 * varp->ndims; @@ -514,8 +551,26 @@ igetput_varn(NC *ncp, else req++; } - else + else { + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=0; j<varp->ndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } req++; + } } if (reqid != NULL) *reqid = lead_req->id; diff --git a/src/drivers/ncmpio/ncmpio_intra_node.c b/src/drivers/ncmpio/ncmpio_intra_node.c index 90d613eb4..38b8c38dd 100644 --- a/src/drivers/ncmpio/ncmpio_intra_node.c +++ b/src/drivers/ncmpio/ncmpio_intra_node.c @@ -3,31 +3,40 @@ * See COPYRIGHT notice in top-level directory. * * This file contains the implementation of intra-node aggregation feature, - * which is designed for the I/O patterns that contain many noncontiguous - * requests interleaved among processes, and spreading across a wide range of - * file space. It is particularly useful when the number of MPI processes - * allocated to a compute node is large. + * which is designed to improve performance for I/O patterns that contain many + * noncontiguous requests interleaved among processes, with an aggregate + * access region on each process wide enough to involve file stripes handled + * by almost all the file servers. By reducing the number of processes per + * node that participate in MPI-IO operations, this feature can effectively + * reduce communication contention, which often happens to jobs that run a + * large number of MPI processes per compute node. * - * This feature is enabled by setting the PnetCDF hint 'nc_num_aggrs_per_node' - * to a positive integral value indicating the desired number of processes per - * compute node to be selected as the intra-node I/O aggregators. Each process - * is assigned a unique aggregator. The non-aggregators send their requests to - * the assigned aggregators, and then the aggregators make MPI-IO requests to - * the file. + * Users can enable this feature by setting the PnetCDF I/O hint named + * 'nc_num_aggrs_per_node' to a positive integral value, indicating the desired + * number of processes per compute node to be selected as the intra-node I/O + * aggregators. Processes running on the same node are divided into groups. + * The process with the lowest rank ID is selected as the I/O aggregator of + * that group. Non-aggregators send their requests to their aggregators, and + * then the aggregators make I/O requests to the file, i.e. only aggregators + * make MPI-IO calls. * - * Such strategy can effectively reduce communication congestion due to many - * pending asynchronous messages produced in the collective write inside of - * MPI-IO.
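For reference, an application opts in to this feature through an MPI_Info hint at file create or open time. The following is a minimal sketch, assuming nothing beyond the hint name documented in the comment above; the file name and the value "2" are arbitrary choices for illustration.

#include <mpi.h>
#include <pnetcdf.h>

int main(int argc, char **argv)
{
    int err, ncid;
    MPI_Info info;

    MPI_Init(&argc, &argv);
    MPI_Info_create(&info);
    /* request 2 intra-node aggregators per compute node */
    MPI_Info_set(info, "nc_num_aggrs_per_node", "2");

    err = ncmpi_create(MPI_COMM_WORLD, "testfile.nc", NC_CLOBBER, info, &ncid);
    if (err == NC_NOERR) ncmpi_close(ncid);

    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}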
+ * Because communication within a node can be achieved by memory copy + * operations, its cost is expected to be much lower than that of inter-node + * communication. This feature can therefore effectively reduce the + * communication congestion or exhaustion of message queues caused by the many + * pending asynchronous messages produced in two-phase I/O, the strategy used + * to implement MPI collective I/O. * - * The concept of intra-node request aggregation is based on the paper: + * The concept of intra-node request aggregation and its performance results + * are presented in the following paper. * Q. Kang, S. Lee, K. Hou, R. Ross, A. Agrawal, A. Choudhary, and W. Liao. * Improving MPI Collective I/O for High Volume Non-Contiguous Requests With * Intra-Node Aggregation. IEEE Transactions on Parallel and Distributed - * Systems (TPDS), 31(11):2682-2695, November 2020. + * Systems, 31(11):2682-2695, November 2020. */ #ifdef HAVE_CONFIG_H -# include <config.h> +#include <config.h> #endif #include @@ -41,28 +50,34 @@ #include #include "ncmpio_NC.h" +/* swap values of x and y */ +#define SWAP1(x, y, tmp) { tmp = x ; x = y; y = tmp ; } + #ifdef HAVE_MPI_LARGE_COUNT +/* swap elements of arrays x, y, and corresponding lengths and bufAddr */ #define SWAP(offsets, lengths, bufAddr, x, y) { \ MPI_Count aint; \ MPI_Count cint; \ MPI_Count d0 = (x) - offsets; \ MPI_Count d1 = (y) - offsets; \ if (d0 != d1) { \ - cint = *(x) ; *(x) = *(y) ; *(y) = cint ; \ - cint = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = cint ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), cint); \ + SWAP1(lengths[d0], lengths[d1], cint); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #else #define SWAP(offsets, lengths, bufAddr, x, y) { \ int int4; \ - MPI_Aint aint; \ - MPI_Aint d0 = (x) - offsets; \ - MPI_Aint d1 = (y) - offsets; \ + MPI_Offset aint; \ + MPI_Offset d0 = (x) - offsets; \ + MPI_Offset d1 = (y) - offsets; \ if (d0 != d1) { \ - aint = *(x) ; *(x) = *(y) ; *(y) = aint ; \ - int4 = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = int4 ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), aint); \ + SWAP1(lengths[d0], lengths[d1], int4); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #endif @@ -71,28 +86,36 @@ ((*(b) < *(c)) ? (b) : ((*(a) < *(c)) ? (c) : (a))) : \ ((*(b) > *(c)) ? (b) : ((*(a) < *(c)) ? (a) : (c)))) +static +size_t bin_search( +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count key, MPI_Count *base, +#else + MPI_Offset key, MPI_Offset *base, +#endif + size_t nmemb); + /*----< qsort_off_len_buf() >------------------------------------------------*/ -/* Sort three arrays of offsets, lengths, and buffer addresses based on the - * increasing order of offsets. This code is based on the qsort routine from - * Bentley & McIlroy's "Engineering a Sort Function". +/* Sort three arrays of offsets, lengths, and buffer addresses by the offsets + * array, into increasing order. This code is based on the qsort routine + * from Bentley & McIlroy's "Engineering a Sort Function".
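The sorting routine keeps three parallel arrays consistent while ordering by offsets. The toy version below illustrates the same invariant with an insertion sort, chosen purely for brevity; the library code uses the qsort variant described above.

#include <stdio.h>

/* sort off[] ascending, applying identical swaps to len[] and addr[] */
static void sort_parallel(int n, long long *off, long long *len, long long *addr)
{
    int i, j;
    for (i = 1; i < n; i++) {
        long long o = off[i], l = len[i], a = addr[i];
        for (j = i; j > 0 && off[j-1] > o; j--) {
            off[j] = off[j-1]; len[j] = len[j-1]; addr[j] = addr[j-1];
        }
        off[j] = o; len[j] = l; addr[j] = a;
    }
}

int main(void)
{
    long long off[4]  = {30, 10, 20, 0};
    long long len[4]  = {3, 1, 2, 4};
    long long addr[4] = {300, 100, 200, 400};
    int i;
    sort_parallel(4, off, len, addr);
    for (i = 0; i < 4; i++)
        printf("off=%lld len=%lld addr=%lld\n", off[i], len[i], addr[i]);
    return 0;
}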
*/ static void -qsort_off_len_buf(MPI_Aint num, +qsort_off_len_buf(MPI_Aint num, #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *offsets, - MPI_Count *lengths, + MPI_Count *offsets, + MPI_Count *lengths, #else - MPI_Aint *offsets, - int *lengths, + MPI_Offset *offsets, + int *lengths, #endif - MPI_Aint *bufAddr) + MPI_Aint *bufAddr) { #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #else - MPI_Aint *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Offset *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #endif - MPI_Aint i, r; while (1) { swap_cnt = 0; @@ -155,7 +178,8 @@ qsort_off_len_buf(MPI_Aint num, if ((r = pd - pc) > 1) { /* Iterate rather than recursively call self to save stack space */ lengths = lengths + (num - r); - bufAddr = bufAddr + (num - r); + if (bufAddr != NULL) + bufAddr = bufAddr + (num - r); offsets = pn - r; num = r; } @@ -164,174 +188,238 @@ qsort_off_len_buf(MPI_Aint num, } } -/*----< ncmpio_init_intra_node_aggr() >--------------------------------------*/ -/* When intra-node write aggregation is enabled, processes on the same node - * will be divided into groups. The number of groups is the number of - * aggregators on that node. The rank IDs of each group must be established. +/*----< heap_merge() >-------------------------------------------------------*/ +/* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. Function calls are too expensive. * - * 1. Find information about MPI processes and their affinity to compute node. - * 2. Determine whether self process is an intra-node aggregator. - * 3. For an aggregator, find the number of non-aggregators assigned to it and - * construct rank IDs of assigned non-aggregators. - * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * Requirement: all individual offsets lists must be already sorted !!! 
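heap_merge below performs a k-way merge of presorted lists. As a simplified, heap-free illustration of the same idea (a linear minimum scan, O(k) per element instead of the O(log k) achieved by the heap), under the same requirement that each input list is already sorted:

#include <stdio.h>
#include <stdlib.h>

/* merge k presorted lists into out[]; list[i] holds cnt[i] elements */
static void kway_merge(int k, long long **list, size_t *cnt, long long *out)
{
    size_t *pos = (size_t*) calloc(k, sizeof(size_t));
    size_t total = 0, n;
    int i, min_i;

    for (i = 0; i < k; i++) total += cnt[i];
    for (n = 0; n < total; n++) {
        min_i = -1;
        for (i = 0; i < k; i++)
            if (pos[i] < cnt[i] &&
                (min_i < 0 || list[i][pos[i]] < list[min_i][pos[min_i]]))
                min_i = i;
        out[n] = list[min_i][pos[min_i]++];
    }
    free(pos);
}

int main(void)
{
    long long a[] = {1, 4, 9}, b[] = {2, 3, 8}, c[] = {5, 7};
    long long *list[3] = {a, b, c}, out[8];
    size_t cnt[3] = {3, 3, 2}, n;
    kway_merge(3, list, cnt, out);
    for (n = 0; n < 8; n++) printf("%lld ", out[n]);
    printf("\n");
    return 0;
}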
*/ -int -ncmpio_intra_node_aggr_init(NC *ncp) +static +void heap_merge(int nprocs, + const MPI_Aint *count, /* [nprocs] */ + MPI_Aint nelems, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, /* [nelems] */ + MPI_Count *blklens, /* [nelems] */ +#else + MPI_Offset *offsets, /* [nelems] */ + int *blklens, /* [nelems] */ +#endif + MPI_Aint *bufAddr) /* [nelems] */ { - char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; - int i, j, k, my_procname_len, num_nodes, root=0; - int *node_ids=NULL, *all_procname_lens=NULL, *nprocs_per_node; - int naggrs_my_node, num_nonaggrs; - int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node; - - /* initialize parameters of local-node aggregation */ - ncp->my_aggr = -1; /* rank ID of my aggregator */ - ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ - ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = 0.0; + typedef struct { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off_list; + MPI_Count *len_list; +#else + MPI_Offset *off_list; + int *len_list; #endif + MPI_Aint *addr_list; + MPI_Aint count; + } heap_struct; - if (ncp->num_aggrs_per_node == 0 || ncp->num_aggrs_per_node == ncp->nprocs) - /* disable intra-node aggregation */ - return NC_NOERR; + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + size_t sum; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + /* This heap_merge is not in-place, taking too much memory footprint */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); + MPI_Count *srt_len = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); +#else + MPI_Aint *srt_off = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + int *srt_len = (int*) NCI_Malloc(sizeof(int) * nelems); #endif + MPI_Aint *srt_addr = NULL; + + if (bufAddr != NULL) + srt_addr = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + + a = (heap_struct *) NCI_Calloc(nprocs, sizeof(heap_struct)); + + /* there are nprocs number of lists to be merged */ + j = 0; + sum = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + /* each of a[j].off_list is already sorted */ + a[j].off_list = offsets + sum; + a[j].len_list = blklens + sum; + if (bufAddr != NULL) + a[j].addr_list = bufAddr + sum; + sum += count[i]; + a[j].count = count[i]; + j++; + } + } + nprocs = j; /* some count[i] may be zero */ - /* allocate space for storing the rank IDs of non-aggregators assigned to - * this rank. Note ncp->nonaggr_ranks[] will be freed when closing the - * file, if allocated. - */ - num_nonaggrs = ncp->nprocs / ncp->num_aggrs_per_node + 1; - ncp->nonaggr_ranks = (int*) NCI_Malloc(sizeof(int) * num_nonaggrs); +#define SWAP_HEAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } - /* Collect info about compute nodes in order to select I/O aggregators. - * Note my_procname is null character terminated, but my_procname_len - * does not include the null character. + heapsize = nprocs; + + /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. 
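The loop that follows is the classic iterative sift-down on a min-heap. Isolated from the surrounding bookkeeping, and applied to a plain array for illustration, it reduces to:

/* iterative sift-down for a min-heap stored in heap[0..heapsize-1] */
static void sift_down(long long *heap, int heapsize, int k)
{
    for (;;) {
        int r = 2 * (k + 1);   /* right child, same indexing as the code here */
        int l = r - 1;         /* left child */
        int smallest = k;
        long long tmp;
        if (l < heapsize && heap[l] < heap[smallest]) smallest = l;
        if (r < heapsize && heap[r] < heap[smallest]) smallest = r;
        if (smallest == k) break;
        tmp = heap[k]; heap[k] = heap[smallest]; heap[smallest] = tmp;
        k = smallest;
    }
}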
*/ - MPI_Get_processor_name(my_procname, &my_procname_len); - my_procname_len++; /* to include terminate null character */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - if (ncp->rank == root) { - /* root collects all procnames */ - all_procnames = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - if (all_procnames == NULL) - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - all_procname_lens = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - if (all_procname_lens == NULL) { - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } } - /* gather process name lengths from all processes first */ - MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, - root, ncp->comm); - if (ncp->rank == root) { - int *disp; - size_t alloc_size = 0; + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < nelems; i++) { + /* extract smallest element from heap, i.e. the root */ + srt_off[i] = a[0].off_list[0]; + srt_len[i] = a[0].len_list[0]; + if (bufAddr != NULL) + srt_addr[i] = a[0].addr_list[0]; + a[0].count--; + + if (!a[0].count) { + a[0] = a[heapsize - 1]; + heapsize--; + } else { + a[0].off_list++; + a[0].len_list++; + if (bufAddr != NULL) + a[0].addr_list++; + } - for (i=0; i<ncp->nprocs; i++) - alloc_size += all_procname_lens[i]; + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - all_procnames[0] = (char *) NCI_Malloc(alloc_size); - if (all_procnames[0] == NULL) { - NCI_Free(all_procname_lens); - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) - } + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - /* Construct displacement array for the MPI_Gatherv, as each process - * may have a different length for its process name.
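The removed code used the standard two-step gather pattern: gather the per-process lengths first, then build a displacement array for MPI_Gatherv. A standalone sketch of that pattern, with illustrative names only:

#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char name[MPI_MAX_PROCESSOR_NAME];
    int rank, nprocs, len, i;
    int *lens = NULL, *disp = NULL;
    char *all = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Get_processor_name(name, &len);
    len++;                          /* include the terminating null */

    if (rank == 0) lens = (int*) malloc(sizeof(int) * nprocs);
    MPI_Gather(&len, 1, MPI_INT, lens, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        int total = 0;
        disp = (int*) malloc(sizeof(int) * nprocs);
        for (i = 0; i < nprocs; i++) { disp[i] = total; total += lens[i]; }
        all = (char*) malloc(total);
    }
    MPI_Gatherv(name, len, MPI_CHAR, all, lens, disp, MPI_CHAR, 0,
                MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < nprocs; i++) printf("rank %d: %s\n", i, all + disp[i]);
        free(lens); free(disp); free(all);
    }
    MPI_Finalize();
    return 0;
}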
- */ - disp = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - disp[0] = 0; - for (i=1; i<ncp->nprocs; i++) { - all_procnames[i] = all_procnames[i - 1] + all_procname_lens[i - 1]; - disp[i] = disp[i - 1] + all_procname_lens[i - 1]; + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } } - /* gather all process names */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - all_procnames[0], all_procname_lens, disp, MPI_CHAR, - root, ncp->comm); +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(offsets, srt_off, sizeof(MPI_Count) * nelems); + memcpy(blklens, srt_len, sizeof(MPI_Count) * nelems); +#else + memcpy(offsets, srt_off, sizeof(MPI_Offset) * nelems); + memcpy(blklens, srt_len, sizeof(int) * nelems); +#endif + if (bufAddr != NULL) + memcpy(bufAddr, srt_addr, sizeof(MPI_Aint) * nelems); + + NCI_Free(a); + if (bufAddr != NULL) NCI_Free(srt_addr); + NCI_Free(srt_len); + NCI_Free(srt_off); +} - NCI_Free(disp); - NCI_Free(all_procname_lens); - } else - /* send process name to root */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - NULL, NULL, NULL, MPI_CHAR, root, ncp->comm); - - /* each MPI process's compute node ID */ - node_ids = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - if (ncp->rank == root) { - /* all_procnames[] can tell us the number of nodes and number of - * processes per node. - */ - char **node_names; - int last; - - /* array of pointers pointing to unique host names (compute nodes) */ - node_names = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - - /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - /* calculate nprocs_per_node[] and node_ids[] */ - last = 0; - num_nodes = 0; /* number of unique compute nodes */ - for (i=0; i<ncp->nprocs; i++) { - k = last; - for (j=0; j<num_nodes; j++) { +/*----< ncmpio_ina_init() >--------------------------------------------------*/ +/* When intra-node write aggregation is enabled, this subroutine initializes + * the metadata to be used for intra-node communication and I/O requests. + * + * Processes on the same node will first be divided into groups. A process with + * the lowest rank ID in a group is selected as the aggregator. Only the + * aggregators call the MPI-IO functions to perform I/O to the file. Thus, this + * subroutine must be called before MPI_File_open() and should be called only + * once at ncmpio_create() or ncmpio_open(). + * + * The subroutine performs the following tasks. + * 1. Make use of the affinity of each MPI process to its compute node, + * represented by ncp->num_nodes and ncp->node_ids[]. These two members of + * ncp should have been set by a call to ncmpii_construct_node_list() + * earlier during ncmpio_create() or ncmpio_open(). + * + ncp->num_nodes is the number of unique compute nodes. + * + ncp->node_ids[ncp->nprocs] contains node IDs for all processes. + * 2. Divide processes into groups, select aggregators, and determine whether + * this process is an intra-node aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + if (ncp->my_aggr == ncp->rank) then this rank is an aggregator. + * 3. For an aggregator, find the number of non-aggregators assigned to it and + * construct a list of rank IDs of non-aggregators of its group. + * + ncp->num_nonaggrs is the number of non-aggregators in its group. + * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + ncp->nonaggr_ranks[] contains the rank IDs of assigned non-aggregators. + * 5.
Create a new MPI communicator consisting of only the aggregators. + * Obtain the rank ID and total process number of the new communicator. + * + ncp->ina_comm contains the aggregators across all nodes. + * + ncp->ina_nprocs is the number of processes in intra-node communicator. + * + ncp->ina_rank is this process's rank ID in intra-node communicator. + */ +int +ncmpio_ina_init(NC *ncp) +{ + int i, j, mpireturn, do_io, ina_nprocs, naggrs_my_node, first_rank; + int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node; + j = sizeof(ncp->ina_time_put) / sizeof(ncp->ina_time_put[0]); + ncp->ina_time_init = ncp->ina_time_flatten = 0.0; + for (i=0; i<j; i++) { + ncp->ina_time_put[i] = ncp->ina_time_get[i] = 0; + ncp->maxmem_put[i] = ncp->maxmem_get[i] = 0; } + ncp->ina_npairs_put = ncp->ina_npairs_get = 0; +#endif + + /* initialize parameters of intra-node aggregation */ + ncp->my_aggr = -1; /* rank ID of my aggregator */ + ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ + ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - MPI_Bcast(node_ids, ncp->nprocs, MPI_INT, root, ncp->comm); + /* Note that an ill value of ncp->num_aggrs_per_node has been checked + * before entering this subroutine. Thus ncp->num_aggrs_per_node must be + * > 0. + */ - /* my_node_id is this rank's node ID */ - my_node_id = node_ids[ncp->rank]; + /* ncp->node_ids[] has been established in ncmpii_construct_node_list() + * called in ncmpio_create() or ncmpio_open() before entering this + * subroutine. my_node_id is this rank's node ID. + */ + my_node_id = ncp->node_ids[ncp->rank]; - /* nprocs_my_node: the number of processes in my node + /* nprocs_my_node: the number of processes in my node * ranks_my_node[]: rank IDs of all processes in my node. - * my_rank_index points to ranks_my_node[] where - * ranks_my_node[my_rank_index] == ncp->rank + * my_rank_index: points to ranks_my_node[] where + * ranks_my_node[my_rank_index] == ncp->rank */ ranks_my_node = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); my_rank_index = -1; nprocs_my_node = 0; for (i=0; i<ncp->nprocs; i++) { - if (node_ids[i] == my_node_id) { + if (ncp->node_ids[i] == my_node_id) { if (i == ncp->rank) my_rank_index = nprocs_my_node; ranks_my_node[nprocs_my_node] = i; @@ -339,80 +427,166 @@ ncmpio_intra_node_aggr_init(NC *ncp) } } assert(my_rank_index >= 0); - /* Now, ranks_my_node[my_rank_index] == ncp->rank */ - NCI_Free(node_ids); - - /* make sure number of aggregators in my node <= nprocs_my_node */ + /* Make sure number of aggregators in my node <= nprocs_my_node. In some + * cases, the number of processes allocated to the last few nodes can be + * less than others. + */ naggrs_my_node = MIN(ncp->num_aggrs_per_node, nprocs_my_node); - /* calculate the number of non-aggregators assigned to an aggregator. - * Note num_nonaggrs includes self. + /* For each aggregation group, calculate the number of non-aggregators, + * ncp->num_nonaggrs. Note ncp->num_nonaggrs includes self rank. */ - num_nonaggrs = nprocs_my_node / naggrs_my_node; - if (nprocs_my_node % naggrs_my_node) num_nonaggrs++; - - if (num_nonaggrs == 1) - /* disable aggregation if the number of non-aggregators assigned to - * this aggregator is 1. Note num_nonaggrs includes self. It is - * possible for aggregation enabled or disabled on different nodes and - * even different aggregation groups on the same node.
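The grouping arithmetic used in the hunk that follows can be checked with a small standalone program. The numbers below are hypothetical: 10 processes on a node and 3 aggregators requested, giving groups of 4, 4, and 2.

#include <stdio.h>

int main(void)
{
    int nprocs_my_node = 10, naggrs_my_node = 3, idx;
    int num_nonaggrs = nprocs_my_node / naggrs_my_node;
    if (nprocs_my_node % naggrs_my_node) num_nonaggrs++;   /* 4 */

    for (idx = 0; idx < nprocs_my_node; idx++) {
        int first_rank = idx - idx % num_nonaggrs;
        int group_size = num_nonaggrs;
        if (group_size > nprocs_my_node - first_rank)      /* trim last group */
            group_size = nprocs_my_node - first_rank;
        printf("index %2d -> aggregator index %d (group of %d)\n",
               idx, first_rank, group_size);
    }
    return 0;
}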
+ ncp->num_nonaggrs = nprocs_my_node / naggrs_my_node; + if (nprocs_my_node % naggrs_my_node) ncp->num_nonaggrs++; + + /* Adjust the number of non-aggregators for the last group of each node, + * to make sure it does not go beyond nprocs_my_node. + */ + first_rank = my_rank_index - my_rank_index % ncp->num_nonaggrs; + ncp->num_nonaggrs = MIN(ncp->num_nonaggrs, nprocs_my_node - first_rank); + + /* Assign the first rank as the intra-node aggregator of this group and + * set the rank ID of my aggregator for each process. + */ + ncp->my_aggr = ranks_my_node[first_rank]; + + if (ncp->num_nonaggrs == 1) { + /* When the number of processes in this group is 1, the aggregation + * is not performed. Note num_nonaggrs includes self rank. * - * Use whether ncp->my_aggr < 0 to tell if aggregation is disabled or - * enabled. + * Note this does not mean intra-node aggregation is disabled. The + * indicator of whether intra-node aggregation is enabled or disabled + * is ncp->num_aggrs_per_node, whose value should be consistent across + * all processes. It is possible for some groups to contain only one + * process, in which case the aggregation is not performed within that + * group. */ - ncp->my_aggr = -1; - else { - /* find the rank ID of aggregator assigned to this rank */ - ncp->my_aggr = ranks_my_node[my_rank_index - my_rank_index % num_nonaggrs]; + assert(ncp->my_aggr == ncp->rank); + } + else if (ncp->my_aggr == ncp->rank) { /* ncp->num_nonaggrs > 1 */ + /* Construct ncp->nonaggr_ranks[], the rank IDs of non-aggregators of + * this group. Note ncp->nonaggr_ranks[], if malloc-ed, will only be + * freed when closing the file. + */ + ncp->nonaggr_ranks = (int*)NCI_Malloc(sizeof(int) * ncp->num_nonaggrs); - if (ncp->my_aggr == ncp->rank) { /* this rank is an aggregator */ - /* Set the number of non-aggregators assigned to this rank. For the - * last group, make sure it does not go beyond nprocs_my_node. + memcpy(ncp->nonaggr_ranks, ranks_my_node + first_rank, + sizeof(int) * ncp->num_nonaggrs); + } + NCI_Free(ranks_my_node); + + /* Next step is to construct a new MPI communicator consisting of all + * intra-node aggregators. It will later be used to call MPI_File_open(), + * so that only aggregators call MPI-IO functions to access the file. + * + * When using the PnetCDF's internal PNCIO driver, we can pass a list of + * node_ids of the new communicator to the PNCIO file handler, + * ncp->pncio_fh, to spare the driver the repeated work of constructing + * the list of node IDs. If the MPI-IO driver is used, ROMIO will redo + * this work internally anyway. + */ + + do_io = (ncp->my_aggr == ncp->rank) ? 1 : 0; + + /* construct an array containing ranks of aggregators */ + ncp->ina_node_list = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + TRACE_COMM(MPI_Allgather)(&do_io, 1, MPI_INT, ncp->ina_node_list, 1, + MPI_INT,ncp->comm); + + /* Calculate the total number of intra-node aggregators */ + for (ina_nprocs=0, i=0; i<ncp->nprocs; i++) + if (ncp->ina_node_list[i]) ina_nprocs++; + + /* Construct ncp->node_ids[] and ncp->ina_node_list[]. Their contents + * depend on the layout of MPI process allocation to the compute nodes. + * The common layouts can be two kinds: + * + cyclic - MPI ranks are assigned to nodes in a round-robin fashion, + * + block - MPI ranks are assigned to a node and then move on to the + * next. + * + * Below uses an example of nodes=3, nprocs=10, * num_aggrs_per_node=2.
+ * ncp->node_ids[] should be + * block process allocation: 0,0,0,0,1,1,1,2,2,2 + * cyclic process allocation: 0,1,2,0,1,2,0,1,2,0 + * Accordingly, ncp->ina_node_list[] can be two kinds + * block process allocation: 1,0,1,0,1,0,1,1,0,1 + * cyclic process allocation: 1,1,1,0,0,0,1,1,1,0 + */ + + /* ncp->node_ids[]: node IDs of processes in the new MPI communicator. + * ncp->ina_node_list[]: the rank IDs of the new MPI communicator. + */ + for (j=0,i=0; i<ncp->nprocs; i++) { + if (ncp->ina_node_list[i]) { + ncp->ina_node_list[j] = i; + /* Modify ncp->node_ids[] to store the node IDs of the processes in + * the new communicator. Note ncp->node_ids[] from now on is used + * by PnetCDF's PNCIO driver only. */ - ncp->num_nonaggrs = MIN(num_nonaggrs, nprocs_my_node - my_rank_index); - if (ncp->num_nonaggrs == 1) - /* disable aggregation, as this aggregation group contains only - * self rank - */ - ncp->my_aggr = -1; - else - /* copy the rank IDs over to ncp->nonaggr_ranks[] */ - memcpy(ncp->nonaggr_ranks, - ranks_my_node + my_rank_index, - sizeof(int) * num_nonaggrs); + ncp->node_ids[j] = ncp->node_ids[i]; + j++; } } - NCI_Free(ranks_my_node); - if (ncp->my_aggr < 0) { - /* free ncp->nonaggr_ranks if aggregation is not enabled */ - NCI_Free(ncp->nonaggr_ranks); - ncp->nonaggr_ranks = NULL; + /* Make MPI calls to create a new communicator. */ + MPI_Group origin_group, ina_group; + TRACE_COMM(MPI_Comm_group)(ncp->comm, &origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_group"); + TRACE_COMM(MPI_Group_incl)(origin_group, ina_nprocs, ncp->ina_node_list, &ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_incl"); + TRACE_COMM(MPI_Comm_create)(ncp->comm, ina_group, &ncp->ina_comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_create"); + TRACE_COMM(MPI_Group_free)(&ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + TRACE_COMM(MPI_Group_free)(&origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + + /* Non-aggregators will have ncp->ina_comm set to MPI_COMM_NULL */ + if (ncp->ina_comm == MPI_COMM_NULL) { + ncp->ina_nprocs = 0; + ncp->ina_rank = -1; + } + else { + MPI_Comm_size(ncp->ina_comm, &ncp->ina_nprocs); + MPI_Comm_rank(ncp->ina_comm, &ncp->ina_rank); } - /* TODO: For automatically determine Whether to enable intra-node write - * aggregation, this should be done right before each collective write - * call. - * 1. obtain hint cb_noddes, and striping_unit + /* TODO: automatically determine whether or not to enable intra-node + * aggregation. + * + * Ideally, this can be determined right before each collective write + * call, because only at that time is the communication pattern known. + * If the pattern can cause contention, then enable it. Otherwise, + * disable it. + * + * Such a mechanism may depend on the following. + * 1. MPI-IO hints cb_nodes and striping_unit + * 2. calculate aggregate access region - In each round of two-phase I/O, when the number of senders to each - cb_nodes is very large, then intra-node aggregation should be enabled. - Average of all nprocs_per_node may be a factor for determining whether - to enable intra-node aggregation. It indicates whether the high number - of processes are allocated on the same node. + 3. If the number of senders to each cb_nodes is very large, then + intra-node aggregation should be enabled. + 4.
The average of nprocs_per_node across all processes may be a factor + * for determining whether to enable intra-node aggregation. It + * indicates whether a high number of processes are allocated on the + * same node. */ #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = MPI_Wtime() - timing; + ncp->ina_time_init = MPI_Wtime() - timing; #endif return NC_NOERR; } /*----< flatten_subarray() >-------------------------------------------------*/ -/* flatten a subarray request into a list of offset-length pairs */ +/* Flatten a subarray request, specified by start[], count[], and stride[] into + * a list of file offset-length pairs, offsets[] and lengths[]. + */ static int flatten_subarray(int ndim, /* number of dimensions */ int el_size, /* array element size */ @@ -426,7 +600,7 @@ flatten_subarray(int ndim, /* number of dimensions */ MPI_Count *offsets, /* OUT: array of offsets */ MPI_Count *lengths /* OUT: array of lengths */ #else - MPI_Aint *offsets, /* OUT: array of offsets */ + MPI_Offset *offsets, /* OUT: array of offsets */ int *lengths /* OUT: array of lengths */ #endif ) @@ -503,12 +677,26 @@ flatten_subarray(int ndim, /* number of dimensions */ subarray_len *= count[ndim]; } + /* check if the list can be coalesced */ + for (i=0, j=1; j<*npairs; j++) { + if (offsets[i] + lengths[i] == offsets[j]) + lengths[i] += lengths[j]; + else { + i++; + if (i < j) { + offsets[i] = offsets[j]; + lengths[i] = lengths[j]; + } + } + } + *npairs = i + 1; + return NC_NOERR; } -/*----< flatten_req() >-----------------------------------------------------*/ -/* flatten one write request into offset-length pairs. - * offsets and lengths are allocated here and need to be freed by the caller +/*----< flatten_req() >------------------------------------------------------*/ +/* Flatten one subarray request into offset-length pairs. Arrays offsets and + * lengths are allocated in this subroutine and need to be freed by the caller.
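What flatten_subarray() produces, one (offset, length) pair per contiguous run plus the coalescing pass added above, can be illustrated standalone for a 2-D, row-major selection. All numbers below are hypothetical.

#include <stdio.h>

int main(void)
{
    long long shape[2] = {8, 10};            /* full array dimensions */
    long long start[2] = {2, 4}, count[2] = {3, 6};
    long long el_size = 4, var_begin = 1024; /* assumed values */
    long long off[8], len[8];
    int i, j, k, npairs = 0;

    /* one pair per row of the selection */
    for (i = 0; i < count[0]; i++) {
        off[npairs] = var_begin + ((start[0]+i) * shape[1] + start[1]) * el_size;
        len[npairs] = count[1] * el_size;
        npairs++;
    }

    /* coalesce adjacent pairs that abut in the file; when count[1] equals
     * shape[1], all rows merge into a single pair */
    k = 0;
    for (j = 1; j < npairs; j++) {
        if (off[k] + len[k] == off[j]) len[k] += len[j];
        else { k++; off[k] = off[j]; len[k] = len[j]; }
    }
    npairs = k + 1;

    for (i = 0; i < npairs; i++)
        printf("pair %d: offset=%lld length=%lld\n", i, off[i], len[i]);
    return 0;
}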
*/ static int flatten_req(NC *ncp, const MPI_Offset *start, const MPI_Offset *count, const MPI_Offset *stride, + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { - int j, err=NC_NOERR, ndims; + int i, j, err=NC_NOERR, ndims; MPI_Aint num, idx; MPI_Offset var_begin, *shape, count0, *ones=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; +#else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; +#endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -537,15 +735,17 @@ flatten_req(NC *ncp, */ if (varp->ndims == 0) { /* scalar variable */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint)); - *lengths = (int*) NCI_Malloc(sizeof(int)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * 2); + lengths = (int*) NCI_Malloc(sizeof(int) * 2); #endif - (*offsets)[0] = varp->begin; - (*lengths)[0] = varp->xsz; + offsets[0] = varp->begin; + lengths[0] = varp->xsz; *num_pairs = 1; + *off_ptr = offsets; + *len_ptr = lengths; return NC_NOERR; } else if (varp->ndims == 1 && IS_RECVAR(varp)) { /* 1D record variable */ @@ -555,22 +755,24 @@ flatten_req(NC *ncp, num = 1; if (stride != NULL && stride[varp->ndims-1] > 1) num = count[varp->ndims-1]; /* count of last dimension */ - for (j=0; j<varp->ndims-1; j++) - num *= count[j]; /* all count[] except the last dimension */ + for (i=0; i<varp->ndims-1; i++) + num *= count[i]; /* all count[] except the last dimension */ } *num_pairs = num; #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * num); - *lengths = (int*) NCI_Malloc(sizeof(int) * num); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (num+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (num+1)); #endif + *off_ptr = offsets; + *len_ptr = lengths; if (stride == NULL) { /* equivalent to {1, 1, ..., 1} */ ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * varp->ndims); - for (j=0; j<varp->ndims; j++) ones[j] = 1; + for (i=0; i<varp->ndims; i++) ones[i] = 1; } ndims = varp->ndims; @@ -589,13 +791,26 @@ flatten_req(NC *ncp, count0 = 1; idx = 0; - for (j=0; j<count0; j++) { flatten_subarray(ndims, varp->xsz, var_begin, shape, start, count, (stride == NULL) ?
ones : stride, - &num, /* OUT: num of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: num of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + if (num == 0) continue; + + /* check if offsets[] are in an increasing order */ + for (j=0; j<num; j++) { + if (prev_end_off > offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } + idx += num; assert(idx <= *num_pairs); @@ -605,30 +820,46 @@ if (ones != NULL) NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + return err; } /*----< flatten_reqs() >-----------------------------------------------------*/ -/* flatten all write requests into offset-length pairs. - * offsets and lengths are allocated here and need to be freed by the caller +/* Flatten multiple subarray requests into file offset-length pairs. Arrays + * offsets and lengths are allocated here and need to be freed by the caller. */ static int flatten_reqs(NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ int num_reqs, /* IN: # requests */ const NC_req *reqs, /* [num_reqs] requests */ + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: total number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { int i, j, status=NC_NOERR, ndims, max_ndims=0; MPI_Aint num, idx; MPI_Offset *start, *count, *shape, *stride, *ones; #ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; #else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; #endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -636,57 +867,60 @@ flatten_reqs(NC *ncp, * contiguous memory space for storing off-len pairs */ for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; - max_ndims = MAX(max_ndims, ndims); - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } + /* reqs[i].npairs is the number of offset-length pairs of this request, + * calculated in ncmpio_igetput_varm() and igetput_varn() + */ + *num_pairs += reqs[i].npairs; + if (fIsSet(reqMode, NC_REQ_WR)) + ndims = ncp->put_lead_list[reqs[i].lead_off].varp->ndims; else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if (ndims == 0) { /* 1D record variable */ - (*num_pairs)++; - continue; - } - num = 1; - if (stride != NULL && stride[ndims-1] > 1) - num = count[ndims-1]; /* count of last dimension */ - for (j=0; j<ndims-1; j++) - num *= count[j]; - *num_pairs += num; + ndims = ncp->get_lead_list[reqs[i].lead_off].varp->ndims; + max_ndims = MAX(max_ndims, ndims); } /* now we can allocate a contiguous memory space for the off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets =
(MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * (*num_pairs)); - *lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (*num_pairs+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs+1)); #endif - idx = 0; + *off_ptr = offsets; + *len_ptr = lengths; ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * max_ndims); for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; + + if (reqs[i].npairs == 1) { + /* When reqs[i] contains only one offset-length pair, re-use + * reqs[i].offset_start, which has been generated earlier at a call + * to ncmpio_intra_node_aggregation_nreqs(). + */ + offsets[idx] = reqs[i].offset_start; + lengths[idx] = reqs[i].nelems * lead->varp->xsz; + + /* check if offsets[] are in an increasing order */ + if (prev_end_off > offsets[idx]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx]; + idx++; + continue; + } ndims = lead->varp->ndims; if (ndims > 0) { @@ -715,20 +949,37 @@ if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - /* flatten each request into a list of offset-length pairs and - * append to the end of offsets and lengths + /* flatten each request into a list of offset-length pairs and append + * to the end of offsets and lengths */ flatten_subarray(ndims, lead->varp->xsz, var_begin, shape, start, count, (stride == NULL) ? ones : stride, - &num, /* OUT: number of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: number of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + /* check if offsets[] are in an increasing order */ + for (j=0; j<num; j++) { + if (prev_end_off > offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } idx += num; } NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; if (fIsSet(lead->flag, NC_REQ_TO_FREE)) { NCI_Free(lead->start); lead->start = NULL; @@ -738,187 +989,434 @@ return status; } -/*----< construct_buf_type() >-----------------------------------------------*/ -/* construct an MPI derived datatype for I/O buffers from the request list, by - * concatenate all buffers. +/*----< flat_buf_type() >----------------------------------------------------*/ +/* Scan the nonblocking requests, pointed by reqs, and build the offset-length + * pairs of all buffers, xbuf. Note xbuf in each nonblocking request is a + * contiguous buffer (packed from the user buffer for the write operations). + * For record variables, if a user request is accessing more than one record, + * the request is split into multiple NC_req objects, one for each record.
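The buffer view assembled by this routine is ultimately a list of byte-granularity (displacement, length) pairs. A minimal, PnetCDF-independent sketch of committing such a list as an MPI derived datatype (the helper name is illustrative):

#include <mpi.h>

/* build and commit a byte-based hindexed type from nb pairs */
static int build_buf_type(int nb, const MPI_Aint *disp, const int *blen,
                          MPI_Datatype *type)
{
    int err = MPI_Type_create_hindexed(nb, blen, disp, MPI_BYTE, type);
    if (err == MPI_SUCCESS) err = MPI_Type_commit(type);
    return err;
}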
*/ static int -construct_buf_type(const NC *ncp, - int num_reqs, /* IN: # requests */ - const NC_req *reqs, /* [num_reqs] requests */ - MPI_Aint *bufLen, /* OUT: buffer size in bytes */ - MPI_Datatype *bufType) /* OUT: buffer datatype */ +flat_buf_type(const NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ + int num_reqs, /* IN: # requests */ + const NC_req *reqs, /* IN: [num_reqs] requests */ + PNCIO_View *buf_view, /* OUT: flattened buftype */ + void **buf) /* OUT: pointer to I/O buffer */ +/* TODO: */ +#if 1 { - int i, err, mpireturn, status=NC_NOERR; + int i, j, err=NC_NOERR; NC_lead_req *lead; + MPI_Aint addr, addr0; +/* buffer offset should be of type MPI_Aint. length should be size_t. */ + + buf_view->type = MPI_BYTE; + buf_view->size = 0; + buf_view->count = 0; + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->is_contig = 1; + if (num_reqs == 0) + return NC_NOERR; + + buf_view->off = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); - MPI_Count *disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + buf_view->len = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #else - int *blocklens = (int*) NCI_Malloc(sizeof(int) * num_reqs); - MPI_Aint *disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + buf_view->len = (int*) NCI_Malloc(sizeof(int) * num_reqs); +#endif + +#if 1 + *buf = reqs[0].xbuf; + + lead = (fIsSet(reqMode, NC_REQ_WR)) ? ncp->put_lead_list + : ncp->get_lead_list; + + MPI_Get_address(lead[reqs[0].lead_off].xbuf, &addr0); +// printf("%s at %d: lead xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr0,lead[reqs[0].lead_off].nelems); + +// assert(reqs[0].xbuf == lead[reqs[0].lead_off].xbuf); + + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr0); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = 0; + + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; +#if 0 printf("%s at %d: buf_view->len[0]=%lld nelems=%lld\n",__func__,__LINE__, buf_view->len[0],reqs[0].nelems); j=0; printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr0,j,buf_view->off[j],reqs[0].nelems); #endif - *bufLen = 0; - for (i=0; i<num_reqs; i++) { +#if 0 +wkl = (int*) malloc(buf_view->len[j]); nelems=buf_view->len[j]/4; xbuf = (char*)reqs[j].xbuf + buf_view->off[j]; memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); for (i=0; i<nelems; i++) printf("%d ", wkl[i]); printf("\n"); +#endif + buf_view->size = buf_view->len[0]; + for (i=0, j=1; j<num_reqs; j++) { + MPI_Get_address(reqs[j].xbuf, &addr); + buf_view->off[j] = addr - addr0; - /* blocklens[] in bytes */ - lead = ncp->put_lead_list + reqs[i].lead_off; - blocklens[i] = reqs[i].nelems * lead->varp->xsz; +// printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr,j,buf_view->off[j],reqs[j].nelems); - *bufLen += blocklens[i]; - } +// assert(reqs[j].xbuf == lead[reqs[j].lead_off].xbuf); + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; /* wkl = (int*) malloc(buf_view->len[j]); nelems=buf_view->len[j]/4; xbuf = (char*)reqs[j].xbuf; // + buf_view->off[j]; xbuf = (char*)(*buf) + buf_view->off[j]; memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); for (i=0; i<nelems; i++) printf("%d ", wkl[i]); printf("\n"); */ + + /* accumulate buffer type size */ + buf_view->size += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } + } + /* After coalescing, the true number of requests may be reduced */ +// printf("%s at %d: buf_view->size=%lld\n",__func__,__LINE__, buf_view->size); #else - /* construct buffer derived datatype */ + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = addr; + + lead = (fIsSet(reqMode, NC_REQ_WR)) ? ncp->put_lead_list + : ncp->get_lead_list; + + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + ? *buf = lead[reqs[0].lead_off].xbuf; + + buf_view->size = buf_view->len[0]; + for (i=0, j=1; j<num_reqs; j++) { + MPI_Get_address(reqs[j].xbuf, &addr); + buf_view->off[j] = addr; - *bufType = MPI_DATATYPE_NULL; + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; + + /* accumulate buffer type size */ + buf_view->size += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } } + /* After coalescing, the true number of requests may be reduced */ +#endif + + if (i + 1 < num_reqs) { + num_reqs = i + 1; /* num_reqs is reduced */ + buf_view->off = (MPI_Offset*)NCI_Realloc(buf_view->off, + sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT + buf_view->len = (MPI_Offset*)NCI_Realloc(buf_view->len, + sizeof(MPI_Offset) * num_reqs); #else + buf_view->len = (int*) NCI_Realloc(buf_view->len, + sizeof(int) * num_reqs); #endif } + buf_view->count = num_reqs; + buf_view->is_contig = (num_reqs <= 1); + /* construct buf_view->type if it is noncontiguous */ + if (num_reqs > 1) { + int mpireturn; #ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, buf_view->len, + buf_view->off, MPI_BYTE, + &buf_view->type); #else + MPI_Aint *disp; #if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view->off; #else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + for (j=0; j<num_reqs; j++) + disp[j] = (MPI_Aint) buf_view->off[j]; #endif + mpireturn = MPI_Type_create_hindexed(num_reqs, buf_view->len, disp, + MPI_BYTE, &buf_view->type); #if SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET + NCI_Free(disp); #endif #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,
"MPI_Type_create_hindexed"); + + buf_view->type = MPI_BYTE; + NCI_Free(buf_view->off); + NCI_Free(buf_view->len); + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->count = 0; + buf_view->size = 0; + } + else { + MPI_Type_commit(&buf_view->type); + } + } + + return err; +} +#else { - int i, j, err, mpireturn, status=NC_NOERR, nreqs; - char *recv_buf=NULL, *wr_buf = NULL; - MPI_Aint npairs=0, *msg; - MPI_Offset offset=0, buf_count; - MPI_Datatype recvTypes, fileType=MPI_BYTE; - MPI_File fh; - MPI_Request *req=NULL; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + int i, j, err, mpireturn, status=NC_NOERR; + NC_lead_req *lead; + MPI_Aint addr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *disps, *blens; +#else + MPI_Aint *disps; + int *blens; #endif + + if (num_reqs == 0) { + buf_view->type = MPI_BYTE; + buf_view->count = 0; + return NC_NOERR; + } + #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count bufLen; - MPI_Type_size_c(bufType, &bufLen); + disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); #else - int bufLen; - MPI_Type_size(bufType, &bufLen); + disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Malloc(sizeof(int) * num_reqs); #endif - bufLen *= bufCount; - /* First, tell aggregator how much to receive by sending: - * (num_pairs and bufLen). The message size to be sent by this rank - * is num_pairs * 2 * sizeof(MPI_Offset) + bufLen - */ - if (ncp->rank == ncp->my_aggr) - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 2); - else - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 2); + /* set disps[0] and blens[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + disps[0] = addr; - msg[0] = num_pairs; - msg[1] = bufLen; + lead = (fIsSet(reqMode, NC_REQ_WR)) ? 
ncp->put_lead_list + : ncp->get_lead_list; - /* Aggregator collects each non-aggregator's num_pairs and bufLen */ - if (ncp->rank == ncp->my_aggr) { - req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); - nreqs = 0; - for (i=1; inum_nonaggrs; i++) - MPI_Irecv(msg + i*2, 2, MPI_AINT, ncp->nonaggr_ranks[i], 0, - ncp->comm, &req[nreqs++]); + /* blens[] are in bytes */ + blens[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + *buf = lead[reqs[0].lead_off].xbuf; - mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; + for (i=0, j=1; jxsz; + + /* coalesce the disps-blens pairs */ + if (disps[i] + blens[i] == disps[j]) + blens[i] += blens[j]; + else { + i++; + if (i < j) { + disps[i] = disps[j]; + blens[i] = blens[j]; + } } } - else { /* non-aggregator */ - MPI_Send(msg, 2, MPI_AINT, ncp->my_aggr, 0, ncp->comm); - if (num_pairs == 0) - NCI_Free(msg); + + if (i + 1 < num_reqs) { + num_reqs = i + 1; +#ifdef HAVE_MPI_LARGE_COUNT + disps = (MPI_Count*)NCI_Realloc(disps, sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Realloc(blens, sizeof(MPI_Count) * num_reqs); +#else + disps = (MPI_Aint*) NCI_Realloc(disps, sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Realloc(blens, sizeof(int) * num_reqs); +#endif } - /* Aggregator collects offset-length pairs from non-aggregators */ - if (ncp->rank == ncp->my_aggr) { - /* calculate the total number of offset-length pairs */ - npairs = num_pairs; - for (i=1; inum_nonaggrs; i++) npairs += msg[i*2]; + buf_view->count = num_reqs; + buf_view->off = disps; + buf_view->len = blens; +/* TODO: below datatype construction moves into ncmpio_read_write() */ + if (num_reqs == 1) { +#if 1 +buf_view->count = blens[0]; +#endif + buf_view->type = MPI_BYTE; + } + else { +#if 1 + /* construct buffer derived datatype */ #ifdef HAVE_MPI_LARGE_COUNT - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Count*) NCI_Realloc(offsets, sizeof(MPI_Count) * npairs); - lengths = (MPI_Count*) NCI_Realloc(lengths, sizeof(MPI_Count) * npairs); - } + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); #else - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Aint*) NCI_Realloc(offsets, sizeof(MPI_Aint) * npairs); - lengths = (int*) NCI_Realloc(lengths, sizeof(int) * npairs); - } + mpireturn = MPI_Type_create_hindexed(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; - nreqs = 0; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Aint aint; - MPI_Count bklens[2]; - MPI_Count disps[2]; + buf_view->type = MPI_BYTE; + buf_view->count = 0; + } + else { + MPI_Type_commit(&buf_view->type); +buf_view->count = 1; + } +#endif + *buf = NULL; /* buf_view->type is constructed using MPI_BOTTOM */ + } - MPI_Get_address(offsets, &aint); - disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); - MPI_Get_address(lengths, &aint); - disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); +#if 1 + NCI_Free(blens); + NCI_Free(disps); +#endif + return status; +} +#endif + +/*----< ina_collect_md() 
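Before any bulk data moves, each aggregation group first exchanges a fixed-size metadata triple per member. A minimal sketch of that first phase, under assumed names (root, nmem, ranks[] are hypothetical stand-ins for the group layout used by this patch):

    #include <stdlib.h>
    #include <string.h>
    #include <mpi.h>

    /* Root gathers one {num_pairs, nbytes, is_incr} triple per member. */
    static void gather_triples(MPI_Comm comm, int root, int nmem,
                               const int *ranks, const MPI_Aint my3[3],
                               MPI_Aint *all /* 3*nmem entries, root only */)
    {
        int i, rank;
        MPI_Comm_rank(comm, &rank);
        if (rank == root) {
            MPI_Request *req = malloc(sizeof(MPI_Request) * (nmem - 1));
            memcpy(all, my3, 3 * sizeof(MPI_Aint));   /* self triple first */
            for (i = 1; i < nmem; i++)
                MPI_Irecv(all + i*3, 3, MPI_AINT, ranks[i], 0, comm, &req[i-1]);
            MPI_Waitall(nmem - 1, req, MPI_STATUSES_IGNORE);
            free(req);
        } else
            MPI_Send((void*)my3, 3, MPI_AINT, root, 0, comm);
    }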
+/*----< ina_collect_md() >---------------------------------------------------*/
+/* Within each intra-node aggregation group, the aggregator collects request
+ * metadata from the non-aggregators into meta, including:
+ * 1. the number of offset-length pairs on each non-aggregator
+ * 2. offsets array of each non-aggregator
+ * 3. lengths array of each non-aggregator
+ * 4. npairs is the total number of offset-length pairs of this group.
+ */
+static
+int ina_collect_md(NC       *ncp,
+                   MPI_Aint *meta,
+#ifdef HAVE_MPI_LARGE_COUNT
+                   MPI_Count  **offsets, /* OUT: may be realloc-ed */
+                   MPI_Count  **lengths, /* OUT: may be realloc-ed */
+#else
+                   MPI_Offset **offsets, /* OUT: may be realloc-ed */
+                   int        **lengths, /* OUT: may be realloc-ed */
+#endif
+                   MPI_Aint *npairs)     /* OUT: total no. off-len pairs */
+{
+    int i, err, mpireturn, status=NC_NOERR, nreqs;
+    MPI_Request *req=NULL;
+    MPI_Aint num_pairs=meta[0];
+
+    /* Aggregator collects each non-aggregator's num_pairs and bufLen */
+    if (ncp->my_aggr == ncp->rank) {
+
+        req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+        nreqs = 0;
+        for (i=1; i<ncp->num_nonaggrs; i++)
+            TRACE_COMM(MPI_Irecv)(meta + i*3, 3, MPI_AINT,
+                      ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
+            }
+        }
+    }
+    else /* non-aggregator */
+        TRACE_COMM(MPI_Send)(meta, 3, MPI_AINT, ncp->my_aggr, 0, ncp->comm);
+
+    /* Secondly, each aggregator collects the offset-length pairs from all its
+     * non-aggregators
+     */
+    if (ncp->my_aggr == ncp->rank) {
+        MPI_Datatype recvType;
+
+        /* calculate the total number of offset-length pairs to receive */
+        for (*npairs=0, i=0; i<ncp->num_nonaggrs; i++) *npairs += meta[i*3];
+
+        /* offsets and lengths have been allocated for storing this rank's
+         * offsets and lengths, realloc them to receive offsets and lengths
+         * from non-aggregators so they can be in a contiguous buffer.
+         */
+#ifdef HAVE_MPI_LARGE_COUNT
+        if (*npairs > num_pairs) {
+            *offsets = (MPI_Count*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Count));
+            *lengths = (MPI_Count*) NCI_Realloc(*lengths, *npairs * sizeof(MPI_Count));
+        }
+#else
+        if (*npairs > num_pairs) {
+            /* realloc to store all pairs in a contiguous buffer */
+            *offsets = (MPI_Offset*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Offset));
+            *lengths = (int*)        NCI_Realloc(*lengths, *npairs * sizeof(int));
+        }
+#endif
+
+        /* To minimize the number of MPI recv calls per non-aggregator, below
+         * creates a derived datatype, recvType, to combine offsets and lengths
+         * into one MPI_Irecv call.
+         */
+        nreqs = 0;
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Aint aint;
+        MPI_Count bklens[2];
+        MPI_Count disps[2];
+
+        MPI_Get_address(*offsets, &aint);
+        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]);
+        MPI_Get_address(*lengths, &aint);
+        disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]);
         for (i=1; i<ncp->num_nonaggrs; i++) {
-            if (msg[i*2] == 0) continue;
-            bklens[0] = msg[i*2] * sizeof(MPI_Count);
-            bklens[1] = msg[i*2] * sizeof(MPI_Count);
+            if (meta[i*3] == 0) continue;
+            bklens[0] = meta[i*3] * sizeof(MPI_Count);
+            bklens[1] = meta[i*3] * sizeof(MPI_Count);
             mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE,
-                                                   &recvTypes);
+                                                   &recvType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
             else {
-                mpireturn = MPI_Type_commit(&recvTypes);
+                mpireturn = MPI_Type_commit(&recvType);
                 if (mpireturn != MPI_SUCCESS) {
                     err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                     /* return the first encountered error if there is any */
@@ -926,35 +1424,34 @@ intra_node_aggregation(NC *ncp,
                 }
             }
             /* post to receive offset-length pairs from non-aggregators */
-            MPI_Irecv_c(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i],
-                        0, ncp->comm, &req[nreqs]);
-            MPI_Type_free(&recvTypes);
+            TRACE_COMM(MPI_Irecv_c)(MPI_BOTTOM, 1, recvType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+            MPI_Type_free(&recvType);
             disps[0] = MPI_Aint_add(disps[0], bklens[0]);
             disps[1] = MPI_Aint_add(disps[1], bklens[1]);
-            nreqs++;
         }
 #else
         int bklens[2];
         MPI_Aint aint, disps[2];
-        MPI_Get_address(offsets, &aint);
-        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Aint) * msg[0]);
-        MPI_Get_address(lengths, &aint);
-        disps[1] = MPI_Aint_add(aint, sizeof(int) * msg[0]);
+        MPI_Get_address(*offsets, &aint);
+        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Offset) * meta[0]);
+        MPI_Get_address(*lengths, &aint);
+        disps[1] = MPI_Aint_add(aint, sizeof(int) * meta[0]);
         for (i=1; i<ncp->num_nonaggrs; i++) {
-            if (msg[i*2] == 0) continue;
-            bklens[0] = msg[i*2] * sizeof(MPI_Aint);
-            bklens[1] = msg[i*2] * sizeof(int);
+            if (meta[i*3] == 0) continue;
+            bklens[0] = meta[i*3] * sizeof(MPI_Offset);
+            bklens[1] = meta[i*3] * sizeof(int);
             mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE,
-                                                 &recvTypes);
+                                                 &recvType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
             else {
-                mpireturn = MPI_Type_commit(&recvTypes);
+                mpireturn = MPI_Type_commit(&recvType);
                 if (mpireturn != MPI_SUCCESS) {
                     err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                     /* return the first encountered error if there is any */
@@ -962,368 +1459,1369 @@ intra_node_aggregation(NC *ncp,
                 }
             }
             /* post to receive offset-length pairs from non-aggregators */
-            MPI_Irecv(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i],
-                      0, ncp->comm, &req[nreqs]);
-            MPI_Type_free(&recvTypes);
+            TRACE_COMM(MPI_Irecv)(MPI_BOTTOM, 1, recvType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+            MPI_Type_free(&recvType);
             disps[0] = MPI_Aint_add(disps[0], bklens[0]);
             disps[1] = MPI_Aint_add(disps[1], bklens[1]);
-            nreqs++;
         }
 #endif
-        mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE);
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
+            }
+        }
+        NCI_Free(req);
     }
     else if (num_pairs > 0) { /* non-aggregator */
-        /* send offset-length pairs data to the aggregator */
+        /* To minimize the number of MPI send calls to the aggregator, below
+         * creates a derived datatype, sendType, to combine offsets and lengths
+         * into one MPI_Send call.
+         */
+        MPI_Datatype sendType;
+
 #ifdef HAVE_MPI_LARGE_COUNT
         MPI_Aint aint;
         MPI_Count bklens[2];
         MPI_Count disps[2];
-        bklens[0] = msg[0] * sizeof(MPI_Count);
+        bklens[0] = meta[0] * sizeof(MPI_Count);
         bklens[1] = bklens[0];
-        MPI_Get_address(offsets, &aint);
+        MPI_Get_address(*offsets, &aint);
         disps[0] = aint;
-        MPI_Get_address(lengths, &aint);
+        MPI_Get_address(*lengths, &aint);
         disps[1] = aint;
         mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE,
-                                               &recvTypes);
+                                               &sendType);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
         }
         else {
-            mpireturn = MPI_Type_commit(&recvTypes);
+            mpireturn = MPI_Type_commit(&sendType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
         }
-        MPI_Send_c(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm);
-        MPI_Type_free(&recvTypes);
+        TRACE_COMM(MPI_Send_c)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0,
+                               ncp->comm);
+        MPI_Type_free(&sendType);
 #else
         int bklens[2];
         MPI_Aint disps[2];
-        bklens[0] = msg[0] * sizeof(MPI_Aint);
-        bklens[1] = msg[0] * sizeof(int);
-        MPI_Get_address(offsets, &disps[0]);
-        MPI_Get_address(lengths, &disps[1]);
+        bklens[0] = meta[0] * sizeof(MPI_Offset);
+        bklens[1] = meta[0] * sizeof(int);
+        MPI_Get_address(*offsets, &disps[0]);
+        MPI_Get_address(*lengths, &disps[1]);
         mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE,
-                                             &recvTypes);
+                                             &sendType);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
         }
         else {
-            mpireturn = MPI_Type_commit(&recvTypes);
+            mpireturn = MPI_Type_commit(&sendType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
         }
-        MPI_Send(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm);
-        MPI_Type_free(&recvTypes);
+        TRACE_COMM(MPI_Send)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0,
+                             ncp->comm);
+        MPI_Type_free(&sendType);
 #endif
-        NCI_Free(msg);
     }
+
+    return status;
+}
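Both sides of this exchange describe two separate arrays with a single two-block hindexed datatype built on absolute addresses, so the offsets and lengths travel in one message. A minimal sender-side sketch of the pattern (simplified to int payloads; an illustration, not the patch's exact code):

    #include <mpi.h>

    /* Send arrays a[n] and b[n] in one message using absolute addresses. */
    static void send_two_arrays(const int *a, const int *b, int n,
                                int dest, MPI_Comm comm)
    {
        int blens[2] = { n * (int)sizeof(int), n * (int)sizeof(int) };
        MPI_Aint disps[2];
        MPI_Datatype two;

        MPI_Get_address(a, &disps[0]);   /* absolute displacements ...   */
        MPI_Get_address(b, &disps[1]);
        MPI_Type_create_hindexed(2, blens, disps, MPI_BYTE, &two);
        MPI_Type_commit(&two);
        MPI_Send(MPI_BOTTOM, 1, two, dest, 0, comm); /* ... so use MPI_BOTTOM */
        MPI_Type_free(&two);
    }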
-    /*
-     * TODO, define a datatype to combine sends of offset-length pairs with the
-     * write data into a single send call.
-     */
-    nreqs = 0;
-    if (ncp->rank == ncp->my_aggr) {
-        /* calculate the total write amount */
-        buf_count = bufLen;
-        for (i=1; i<ncp->num_nonaggrs; i++) buf_count += msg[i*2 + 1];
-
-        /* Allocate receive buffer, which will be sorted into an increasing
-         * order based on the file offsets. Thus, after sorting pack recv_buf
-         * to wr_buf to avoid creating another buffer datatype.
-         */
-        if (buf_count > 0) {
-            recv_buf = (char*) NCI_Malloc(buf_count);
-            wr_buf = (char*) NCI_Malloc(buf_count);
-        }
-        /* First, pack self write data into front of the recv_buf */
-        if (bufLen > 0) {
-            if (bufType == MPI_BYTE)
-                memcpy(recv_buf, buf, bufLen);
-            else {
-                void *inbuf = (buf == NULL) ? MPI_BOTTOM : buf;
+/*----< ina_put() >----------------------------------------------------------*/
+/* This subroutine implements the intra-node aggregation for write operations.
+ */
+static
+int ina_put(NC         *ncp,
+            int         is_incr,   /* if offsets are incremental */
+            MPI_Aint    num_pairs, /* number of offset-length pairs */
 #ifdef HAVE_MPI_LARGE_COUNT
-                MPI_Count position=0;
-                MPI_Count incount = (buf == NULL) ? 1 : bufCount;
-                MPI_Pack_c(inbuf, incount, bufType, recv_buf, bufLen, &position,
-                           MPI_COMM_SELF);
+            MPI_Count  *offsets,
+            MPI_Count  *lengths,
 #else
-                int position=0;
-                int incount = (buf == NULL) ? 1 : bufCount;
-                MPI_Pack(inbuf, incount, bufType, recv_buf, bufLen, &position,
-                         MPI_COMM_SELF);
+            MPI_Offset *offsets,
+            int        *lengths,
 #endif
-            }
+            PNCIO_View  buf_view,
+            void       *buf)       /* user buffer */
+{
+    int i, j, err, mpireturn, status=NC_NOERR;
+    char *recv_buf=NULL, *wr_buf = NULL;
+    MPI_Aint npairs=0, *meta=NULL, *count=NULL;
+    MPI_Offset wr_amnt=0;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Count *off_ptr, *len_ptr;
+#else
+    MPI_Offset *off_ptr;
+    int *len_ptr;
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double endT, startT = MPI_Wtime();
+    MPI_Offset mem_max;
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[0] = MAX(ncp->maxmem_put[0], mem_max);
+#endif
+
+    /* buf may be noncontiguous ! */
+
+    /* Firstly, aggregators collect metadata from non-aggregators.
+     *
+     * This rank tells its aggregator how much metadata to receive from this
+     * rank, by sending: the number of offset-length pairs (num_pairs) and the
+     * user buffer size in bytes (buf_view.size). The message sent by this
+     * rank is three MPI_Aint values.
+     */
+    if (ncp->rank == ncp->my_aggr)
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3);
+    else
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3);
+
+    meta[0] = num_pairs;
+    meta[1] = buf_view.size;
+    meta[2] = is_incr;
+
+    /* Each aggregator first collects metadata about its offset-length pairs,
+     * the size of the write request, and whether the offsets are in an
+     * incremental order. The aggregator will gather these metadata from the
+     * non-aggregators assigned to it.
+     * For write operations, keeping the original offset-length pairs is not
+     * necessary, as they will later be sorted and coalesced before calling
+     * MPI-IO or PNCIO file write.
+     *
+     * Once ina_collect_md() returns, this aggregator's offsets and lengths may
+     * grow to include the ones from non-aggregators (appended).
+     */
+    if (ncp->num_nonaggrs > 1) {
+        err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs);
+        if (err != NC_NOERR) {
+            NCI_Free(meta);
+            return err;
         }
+    }
+    else
+        npairs = num_pairs;
+
+    /* For write operations, the non-aggregators now can start sending their
+     * write data to the aggregator.
+     */
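The is_incr flag carried in meta[2] is what lets an aggregator skip the sort entirely. It can be computed cheaply while flattening; a minimal sketch of the predicate (illustration only):

    #include <stddef.h>

    /* returns 1 if off[0..n-1] is monotonically non-decreasing */
    static int is_nondecreasing(const long long *off, size_t n)
    {
        size_t i;
        for (i = 1; i < n; i++)
            if (off[i] < off[i-1]) return 0;
        return 1;
    }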
+    if (ncp->rank != ncp->my_aggr) { /* non-aggregator */
+        if (meta[0] > 0) {
+            /* Non-aggregators send write data to the aggregator */
 #ifdef HAVE_MPI_LARGE_COUNT
-            MPI_Irecv_c(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i],
-                        0, ncp->comm, &req[nreqs++]);
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Send_c)(buf, num, buf_view.type, ncp->my_aggr,
+                                   0, ncp->comm);
 #else
-            MPI_Irecv(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i],
-                      0, ncp->comm, &req[nreqs++]);
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Send)(buf, num, buf_view.type, ncp->my_aggr,
+                                 0, ncp->comm);
 #endif
-            ptr += msg[i*2 + 1];
-        }
-        mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE);
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
-        }
         }
-        NCI_Free(req);
-        NCI_Free(msg);
+
+        /* Must free offsets and lengths now, as they may be realloc-ed in
+         * ina_collect_md()
+         */
+        if (offsets != NULL) NCI_Free(offsets);
+        if (lengths != NULL) NCI_Free(lengths);
+
+        /* Non-aggregators are done here, as only aggregators call MPI-IO/PNCIO
+         * functions to write data to the file. Non-aggregators do not
+         * participate in MPI-IO calls.
+         */
+        NCI_Free(meta);
+        return status;
     }
-    else if (bufLen > 0) {
-        /* send write data to the aggregator */
-        void *buf_ptr = (buf == NULL) ? MPI_BOTTOM : buf;
+
+    /* The remainder of this subroutine is for aggregators only */
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[1] = MAX(ncp->maxmem_put[1], mem_max);
+    endT = MPI_Wtime();
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[0] += endT - startT;
+    startT = endT;
+#endif
+
+    off_ptr = offsets;
+    len_ptr = lengths;
+
+    /* MPI-IO has the following requirements about filetype.
+     * 1. The (flattened) displacements (of a filetype) are not required to be
+     *    distinct, but they cannot be negative, and they must be monotonically
+     *    non-decreasing.
+     * 2. If the file is opened for writing, neither the etype nor the filetype
+     *    is permitted to contain overlapping regions.
+     */
+    if (npairs > 0) {
+        /* Now this aggregator has received all offset-length pairs from its
+         * non-aggregators. At first, check if a sorting is necessary.
+         */
+        char *ptr;
+        int nreqs, indv_sorted, do_sort, overlap;
+        MPI_Request *req=NULL;
+        MPI_Offset recv_amnt;
+
+        /* check if offsets of all non-aggregators are individually sorted */
+        indv_sorted = 1;
+        do_sort = 0;
+        for (i=-1,j=0; j<ncp->num_nonaggrs; j++) {
+            if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */
+                i = j;
+            if (meta[j*3+2] == 0) { /* j's offsets are not sorted */
+                indv_sorted = 0;
+                do_sort = 1;
+                break;
+            }
+        }
+        /* i is the first non-aggregator whose num_pairs > 0, and
+         * j is the first non-aggregator whose is_incr is false
+         */
+// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted);
+
+        if (i >= 0 && indv_sorted == 1) {
+            /* When all ranks' offsets are individually sorted, we still need
+             * to check if offsets are interleaved among all non-aggregators to
+             * determine whether a sort for all offset-length pairs is
+             * necessary.
+             */
 #ifdef HAVE_MPI_LARGE_COUNT
-        MPI_Count num = (buf == NULL) ? 1 : bufCount;
-        MPI_Send_c(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm);
+            MPI_Count prev_end_off;
 #else
-        int num = (buf == NULL) ? 1 : bufCount;
-        MPI_Send(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm);
+            MPI_Offset prev_end_off;
 #endif
-        NCI_Free(offsets);
-        NCI_Free(lengths);
-    }
+            assert(meta[i*3+2] == 1);
+
+            MPI_Aint sum = meta[i*3];
+            prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */
-    /* aggregator sorts the offset-length pairs, along with the buffer */
-    if (ncp->rank == ncp->my_aggr && npairs > 0) {
+            /* check if the offsets are interleaved */
+            for (++i; i<ncp->num_nonaggrs; i++) {
+                if (meta[i*3] == 0) /* zero-sized request */
+                    continue;
+                assert(meta[i*3+2] == 1);
-        /* construct array of buffer addresses */
+                if (prev_end_off > off_ptr[sum]) {
+                    /* off_ptr[sum] is non-aggregator i's 1st offset */
+                    do_sort = 1; /* offsets are not incrementing */
+                    break;
+                }
+                /* move on to the next non-aggregator */
+                sum += meta[i*3];
+                prev_end_off = off_ptr[sum-1];
+            }
+        }
+
+        if (do_sort && indv_sorted) {
+            /* Interleaved offsets are found but individual offsets are already
+             * sorted. In this case, heap_merge() is called to merge all
+             * offsets into one single sorted offset list. Note count[] is
+             * initialized and will be used in heap_merge()
+             */
+            count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) *ncp->num_nonaggrs);
+            for (i=0; i<ncp->num_nonaggrs; i++) count[i] = meta[i*3];
+        }
+
+        /* Construct an array of buffer addresses containing a mapping between
+         * the buffer used to receive write data from non-aggregators and the
+         * buffer used to write to the file. bufAddr[] is calculated based on
+         * the assumption that the write buffer is contiguous.
+         */
         MPI_Aint *bufAddr = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * npairs);
         bufAddr[0] = 0;
         for (i=1; i<npairs; i++)
             bufAddr[i] = bufAddr[i-1] + len_ptr[i-1];
+
+        if (do_sort) {
+            /* Sort offsets and lengths, based on offsets, into an increasing
+             * order.
+             */
+            if (indv_sorted) {
+                heap_merge(ncp->num_nonaggrs, count, npairs, off_ptr, len_ptr,
+                           bufAddr);
+                NCI_Free(count);
+            }
+            else
+                /* When some individual offsets are not sorted, we cannot use
+                 * heap_merge(). Note qsort() is an in-place sorting.
+                 */
+                qsort_off_len_buf(npairs, off_ptr, len_ptr, bufAddr);
+        }
+// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted);
-        /* merge the overlapped buffer segments, skip the overlapped regions
-         * for those with higher j indices (i.e. requests with lower j indices
-         * win the writes to the overlapped regions)
+        /* Now off_ptr and len_ptr are sorted, but overlaps may exist between
+         * adjacent pairs. If this is the case, they must be coalesced.
+         *
+         * The loop below checks if there is overlap and calculates recv_amnt
+         * and wr_amnt.
+         * recv_amnt is the total amount this aggregator will receive from
+         * non-aggregators, including self. recv_amnt includes overlaps.
+         * wr_amnt is recv_amnt with overlap removed.
+         *
+         * This loop also coalesces offset-length pairs as well as the
+         * corresponding buffer addresses, so they can be used to move write
+         * data around in the true write buffer.
          */
+        overlap = 0;
+int fake_overlap=0;
+        wr_amnt = recv_amnt = len_ptr[0];
         for (i=0, j=1; j<npairs; j++) {
-            if (offsets[i] + lengths[i] >= offsets[j] + lengths[j])
+            recv_amnt += len_ptr[j];
+            if (off_ptr[i] + len_ptr[i] >= off_ptr[j] + len_ptr[j]) {
+                overlap = 1;
+fake_overlap=1;
                 /* segment i completely covers segment j, skip j */
                 continue;
+            }
-            MPI_Offset gap = offsets[i] + lengths[i] - offsets[j];
-            if (gap >= 0) { /* segments i and j overlap */
-                if (bufAddr[i] + lengths[i] == bufAddr[j] + gap) {
-                    /* buffers i and j are contiguous, merge j to i */
-                    lengths[i] += lengths[j] - gap;
+            MPI_Offset gap = off_ptr[i] + len_ptr[i] - off_ptr[j];
+            if (gap >= 0) { /* overlap detected, merge j into i */
+                /* when gap > 0, pairs i and j overlap
+                 * when gap == 0, pairs i and j are contiguous
+                 */
+                if (gap > 0) overlap = 1;
+if (gap >= 0) fake_overlap=1;
+                wr_amnt += len_ptr[j] - gap;
+                if (bufAddr[i] + len_ptr[i] == bufAddr[j] + gap) {
+                    /* buffers i and j are contiguous, merge j into i */
+                    len_ptr[i] += len_ptr[j] - gap;
                 }
                 else { /* buffers are not contiguous, reduce j's len */
-                    offsets[i+1] = offsets[j] + gap;
-                    lengths[i+1] = lengths[j] - gap;
+                    off_ptr[i+1] = off_ptr[j] + gap;
+                    len_ptr[i+1] = len_ptr[j] - gap;
                     bufAddr[i+1] = bufAddr[j] + gap;
                     i++;
                 }
             }
             else { /* i and j do not overlap */
+                wr_amnt += len_ptr[j];
                 i++;
                 if (i < j) {
-                    offsets[i] = offsets[j];
-                    lengths[i] = lengths[j];
+                    off_ptr[i] = off_ptr[j];
+                    len_ptr[i] = len_ptr[j];
                     bufAddr[i] = bufAddr[j];
                 }
             }
         }
-        /* update number of pairs, now all off-len pairs are not overlapped */
+/*
+if (ncp->num_nonaggrs == 1 && do_sort == 1) printf("%s at %d: overlap=%d do_sort=%d after coalesce npairs changed from %ld to %d wr_amnt=%lld recv_amnt=%lld\n",__func__,__LINE__, overlap, do_sort,npairs,i+1,wr_amnt,recv_amnt);
+*/
+
+if (fake_overlap == 0) assert(npairs == i+1);
+
+        /* Now off_ptr[], len_ptr[], bufAddr[] are coalesced with no overlap */
         npairs = i+1;
-        /* pack recv_buf, data received from non-aggregators, into wr_buf, a
-         * contiguous buffer, wr_buf, which will later be used in a call to
-         * MPI_File_write_at_all()
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_put[2] = MAX(ncp->maxmem_put[2], mem_max);
+
+        endT = MPI_Wtime();
+        ncp->ina_time_put[1] += endT - startT;
+        ncp->ina_npairs_put = MAX(ncp->ina_npairs_put, npairs);
+        startT = endT;
+#endif
+
+        /* Allocate the receive buffer. Once write data from non-aggregators
+         * have been received into recv_buf, it is packed into wr_buf. Then,
+         * wr_buf is used to call MPI-IO/PNCIO file write. Note wr_buf is
+         * always contiguous.
+         *
+         * When ncp->num_nonaggrs == 1, wr_buf is set to buf, which is directly
+         * passed to MPI-IO/PNCIO file write.
+         *
+         * If the file offset-length pairs have not been re-ordered, i.e.
+         * sorted and overlaps removed, and this aggregator will not receive
+         * any write data from its non-aggregators, then we can use the user's
+         * buffer, buf, to call MPI-IO/PNCIO to write to the file, without
+         * allocating an additional temporary buffer.
+         */
+        if (!do_sort && buf_view.size == recv_amnt && !overlap)
+            recv_buf = buf;
+        else
+            recv_buf = (char*) NCI_Malloc(recv_amnt);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_put[3] = MAX(ncp->maxmem_put[3], mem_max);
+#endif
+
+        if (recv_buf != buf) {
+            /* Pack this aggregator's write data into the front of recv_buf */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count pos=0;
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            MPI_Pack_c(buf, num, buf_view.type, recv_buf, buf_view.size, &pos,
+                       MPI_COMM_SELF);
+#else
+            int pos=0;
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            MPI_Pack(buf, num, buf_view.type, recv_buf, buf_view.size, &pos,
+                     MPI_COMM_SELF);
+#endif
+        }
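Staging a (possibly noncontiguous) user buffer into a contiguous buffer through its derived datatype follows the standard MPI_Pack pattern. A minimal sketch, assuming btype was built with absolute displacements and stage holds nbytes (illustration only):

    /* btype describes the user buffer via absolute addresses, so the input
     * buffer argument is MPI_BOTTOM and one "element" covers everything. */
    int pos = 0;
    MPI_Pack(MPI_BOTTOM, 1, btype, stage, nbytes, &pos, MPI_COMM_SELF);
    /* on success, pos == nbytes, and stage[] holds the packed bytes */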
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        endT = MPI_Wtime();
+        ncp->ina_time_put[2] += endT - startT;
+        startT = endT;
+#endif
+
+        /* Receive write data sent from non-aggregators. Note we cannot move
+         * the posting of the MPI_Irecv calls to before sorting and leave
+         * MPI_Waitall() to after sorting, to overlap communication with the
+         * sorting, because the sorting determines the receive buffer size.
+         */
-        char *ptr = wr_buf;
-        buf_count = 0;
-        if (npairs > 0) {
-            memcpy(ptr, recv_buf + bufAddr[0], lengths[0]);
-            ptr += lengths[0];
-            buf_count = lengths[0];
+        req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+        ptr = recv_buf + buf_view.size;
+        nreqs = 0;
+        for (i=1; i<ncp->num_nonaggrs; i++) {
+            if (meta[i*3 + 1] == 0) continue;
+#ifdef HAVE_MPI_LARGE_COUNT
+            TRACE_COMM(MPI_Irecv_c)(ptr, meta[i*3 + 1], MPI_BYTE,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#else
+            TRACE_COMM(MPI_Irecv)(ptr, meta[i*3 + 1], MPI_BYTE,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#endif
+            ptr += meta[i*3 + 1];
         }
-        for (i=0, j=1; j<npairs; j++) {
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
             }
-            else {
-                i++;
-                if (i < j) {
-                    offsets[i] = offsets[j];
-                    lengths[i] = lengths[j];
-                }
+        }
+        NCI_Free(req);
+
+        /* Now all write data has been collected into recv_buf. In case of any
+         * overlap, we must coalesce recv_buf into wr_buf using off_ptr[],
+         * len_ptr[], and bufAddr[]. For overlapped regions, requests with
+         * lower j indices win the writes to the overlapped regions.
+         *
+         * In case the user buffer, buf, cannot be used to write to the file,
+         * the loop below packs recv_buf, the data received from
+         * non-aggregators, into wr_buf, a contiguous buffer, which will later
+         * be used in a call to MPI-IO/PNCIO file write.
+         */
+        if (!do_sort && wr_amnt == recv_amnt)
+            wr_buf = recv_buf;
+        else {
+            /* do_sort means the buffer's offsets and lengths have been moved
+             * around in order to make the file offset-length pairs
+             * monotonically non-decreasing. We need to copy the write data
+             * into a temporary buffer, wr_buf, and write it to the file.
+             */
+            wr_buf = NCI_Malloc(wr_amnt);
+            ptr = wr_buf;
+
+            for (j=0; j<npairs; j++) {
+                memcpy(ptr, recv_buf + bufAddr[j], len_ptr[j]);
+                ptr += len_ptr[j];
             }
         }
+    } /* end of if (npairs > 0) */
-        /* update number of pairs, now all off-len pairs are not overlapped */
-        npairs = i+1;
+    NCI_Free(meta);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[3] += endT - startT;
+#endif
+
+    /* set the fileview */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr);
+    if (err != NC_NOERR) {
+        if (status == NC_NOERR) status = err;
+        wr_amnt = 0;
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[4] = MAX(ncp->maxmem_put[4], mem_max);
+#endif
+
+    if (wr_buf != buf) {
+        /* If the write data has been packed into wr_buf, a contiguous buffer,
+         * buf_view must be updated before passing it to the MPI-IO/PNCIO file
+         * write.
+         */
+        buf_view.size = wr_amnt;
+        buf_view.type = MPI_BYTE;
+        buf_view.is_contig = 1;
+    }
+    /* the else case is when the user's buffer, buf, can be used to write */
+
+    /* carry out the write request to the file */
+    err = ncmpio_read_write(ncp, NC_REQ_WR, 0, buf_view, wr_buf);
+    if (status == NC_NOERR) status = err;
+
+    if (wr_buf != buf) NCI_Free(wr_buf);
+
+    /* Must free offsets and lengths now, as they may be realloc-ed in
+     * ina_collect_md()
+     */
+    if (offsets != NULL) NCI_Free(offsets);
+    if (lengths != NULL) NCI_Free(lengths);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[5] = MAX(ncp->maxmem_put[5], mem_max);
+#endif
+
+    return status;
+}
+
+static
+size_t bin_search(
+#ifdef HAVE_MPI_LARGE_COUNT
+                  MPI_Count key, MPI_Count *base,
+#else
+                  MPI_Offset key, MPI_Offset *base,
+#endif
+                  size_t nmemb)
+{
+    size_t low, high;
+
+    /* only one element (callers guarantee key >= base[0]) */
+    if (nmemb == 1)
+        return (base[0] <= key) ? 0 : (size_t)-1;
+
+    /* check the 1st element */
+    if (base[0] <= key && key < base[1])
+        return 0;
+
+    low = 1;
+    high = nmemb - 1;
+
+    while (low <= high) {
+        size_t mid = low + (high - low) / 2;
+        if (base[mid] == key)
+            return mid;
+        if (base[mid] < key)
+            low = mid + 1;
+        else
+            high = mid - 1;
+    }
+    return (low - 1);
+}
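How bin_search() is meant to be used: given the sorted starts of the coalesced intervals, it returns the index of the interval whose start is the largest one not exceeding the key. A tiny usage illustration with hypothetical values:

    /* starts[] holds sorted interval starting offsets */
    MPI_Offset starts[4] = {0, 100, 250, 400};
    size_t k = bin_search(130, starts, 4);
    /* k == 1, because 130 falls in the interval beginning at 100 */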
+/*----< ina_get() >----------------------------------------------------------*/
+/* This subroutine implements the intra-node aggregation for read operations.
+ */
+static
+int ina_get(NC         *ncp,
+            int         is_incr,   /* if offsets are incremental */
+            MPI_Aint    num_pairs, /* number of offset-length pairs */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count  *offsets,
+            MPI_Count  *lengths,
+#else
+            MPI_Offset *offsets,
+            int        *lengths,
+#endif
+            PNCIO_View  buf_view,
+            void       *buf)       /* user buffer */
+{
+    int i, j, err, mpireturn, status=NC_NOERR, nreqs;
+    int do_sort=0, indv_sorted=1, overlap=0;
+    char *rd_buf = NULL;
+    MPI_Aint npairs=0, max_npairs, *meta=NULL, *count=NULL;
+    MPI_Offset send_amnt=0, rd_amnt=0, off_start;
+    MPI_Request *req=NULL;
+    PNCIO_View rd_buf_view;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Count *off_ptr, *len_ptr, *orig_off_ptr, *orig_len_ptr;
+    MPI_Count bufLen, *orig_offsets=NULL, *orig_lengths=NULL;
+    MPI_Count *blks = NULL, *disps = NULL;
+#else
+    MPI_Offset *orig_offsets=NULL, *orig_off_ptr, *off_ptr;
+    int bufLen, *orig_lengths=NULL, *orig_len_ptr, *len_ptr, *blks = NULL;
+    MPI_Aint *disps = NULL;
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double endT, startT = MPI_Wtime();
+    MPI_Offset mem_max;
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[0] = MAX(ncp->maxmem_get[0], mem_max);
+#endif
-    if (npairs == 1) {
-        /* No need to create fileType if writing to a contiguous space */
-        offset = offsets[0];
+    bufLen = buf_view.size;
+
+    /* Firstly, aggregators collect metadata from non-aggregators.
+     *
+     * This rank tells its aggregator how much metadata to receive from this
+     * rank, by sending
+     * 1. the number of offset-length pairs (num_pairs)
+     * 2. the user buffer size in bytes (bufLen)
+     * 3. whether this rank's offsets are sorted in increasing order.
+     * The message sent by this rank is three MPI_Aint values.
+     */
+    if (ncp->rank == ncp->my_aggr)
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3);
+    else
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3);
+
+    meta[0] = num_pairs;
+    meta[1] = bufLen;
+    meta[2] = is_incr;
+
+    /* Each aggregator first collects metadata about its offset-length pairs,
+     * the size of the read request, and whether the offsets are in an
+     * incremental order. The aggregator will gather these metadata from the
+     * non-aggregators assigned to it.
+     *
+     * Once ina_collect_md() returns, this aggregator's offsets and lengths may
+     * grow to include the ones from non-aggregators (appended).
+     */
+    if (ncp->num_nonaggrs > 1) {
+        err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs);
+        if (err != NC_NOERR) {
+            NCI_Free(meta);
+            return err;
         }
-    else {
+    }
+    else
+        npairs = num_pairs;
+
+    if (ncp->rank != ncp->my_aggr) {
+        if (meta[0] > 0) {
+            /* For read operations, the non-aggregators now can start receiving
+             * their read data from the aggregator.
+             */
+            MPI_Status st;
 #ifdef HAVE_MPI_LARGE_COUNT
-        /* construct fileview */
-        mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets,
-                                               MPI_BYTE, &fileType);
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Recv_c)(buf, num, buf_view.type, ncp->my_aggr, 0,
+                                   ncp->comm, &st);
+#else
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Recv)(buf, num, buf_view.type, ncp->my_aggr, 0,
+                                 ncp->comm, &st);
+#endif
+        }
+
+        /* Must free offsets and lengths now, as they may be realloc-ed in
+         * ina_collect_md()
+         */
+        if (offsets != NULL) NCI_Free(offsets);
+        if (lengths != NULL) NCI_Free(lengths);
+        /* Non-aggregators are now done, as they do not participate in MPI-IO
+         * or PNCIO file read.
+         */
+        NCI_Free(meta);
+        return status;
+    }
+
+    /* The remainder of this subroutine is for aggregators only. */
+
+    /* For read operations, the original offsets and lengths must be kept
+     * untouched, because the later sorting and coalescing will mess up the
+     * original order of offsets and lengths, which are needed to construct a
+     * datatype when an aggregator sends read data to its non-aggregators.
+     */
+#ifdef HAVE_MPI_LARGE_COUNT
+    orig_offsets = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs);
+    orig_lengths = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs);
+    memcpy(orig_offsets, offsets, sizeof(MPI_Count) * npairs);
+    memcpy(orig_lengths, lengths, sizeof(MPI_Count) * npairs);
 #else
-        /* construct fileview */
-        mpireturn = MPI_Type_create_hindexed(npairs, lengths, offsets,
-                                             MPI_BYTE, &fileType);
+    orig_offsets = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * npairs);
+    orig_lengths = (int*)        NCI_Malloc(sizeof(int) * npairs);
+    memcpy(orig_offsets, offsets, sizeof(MPI_Offset) * npairs);
+    memcpy(orig_lengths, lengths, sizeof(int) * npairs);
+#endif
+    orig_off_ptr = orig_offsets;
+    orig_len_ptr = orig_lengths;
+    off_ptr = offsets;
+    len_ptr = lengths;
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[1] = MAX(ncp->maxmem_get[1], mem_max);
 #endif
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
+
+    /* MPI-IO has the following requirements about filetype.
+     * 1. The (flattened) displacements (of a filetype) are not required to be
+     *    distinct, but they cannot be negative, and they must be monotonically
+     *    non-decreasing.
+     * 2. If the file is opened for writing, neither the etype nor the filetype
+     *    is permitted to contain overlapping regions.
+     */
+    if (npairs > 0) {
+        /* Now this aggregator has received all offset-length pairs from its
+         * non-aggregators. At first, check if a sorting is necessary.
+         */
+
+        /* check if offsets of all non-aggregators are individually sorted */
+        indv_sorted = 1;
+        for (i=-1,j=0; j<ncp->num_nonaggrs; j++) {
+            if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */
+                i = j;
+            if (meta[j*3+2] == 0) { /* j's offsets are not sorted */
+                indv_sorted = 0;
+                do_sort = 1;
+                break;
             }
-        else {
-            mpireturn = MPI_Type_commit(&fileType);
-            if (mpireturn != MPI_SUCCESS) {
-                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
-                /* return the first encountered error if there is any */
-                if (status == NC_NOERR) status = err;
+        }
+        /* i is the first non-aggregator whose num_pairs > 0
+         * j is the first non-aggregator whose is_incr is false
+         */
+
+        if (i >= 0 && indv_sorted == 1) {
+            /* When all ranks' offsets are individually sorted, we still need
+             * to check if offsets are interleaved among all non-aggregators to
+             * determine whether a sort for all offset-length pairs is
+             * necessary.
+             */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count prev_end_off;
+#else
+            MPI_Offset prev_end_off;
+#endif
+            assert(meta[i*3+2] == 1);
+
+            MPI_Aint sum = meta[i*3];
+            prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */
+
+            /* check if the offsets are interleaved */
+            for (++i; i<ncp->num_nonaggrs; i++) {
+                if (meta[i*3] == 0) /* zero-sized request */
+                    continue;
+                assert(meta[i*3+2] == 1);
+                if (prev_end_off > off_ptr[sum]) {
+                    /* off_ptr[sum] is non-aggregator i's 1st offset */
+                    do_sort = 1; /* offsets are not incrementing */
+                    break;
                 }
+                /* move on to the next non-aggregator */
+                sum += meta[i*3];
+                prev_end_off = off_ptr[sum-1];
             }
         }
+
+        if (do_sort && indv_sorted) {
+            /* Interleaved offsets are found but individual offsets are already
+             * sorted. In this case, heap_merge() is called to merge all
+             * offsets into one single sorted offset list. Note count[] is
+             * initialized and will be used in heap_merge()
+             */
+            count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint)* ncp->num_nonaggrs);
+            for (i=0; i<ncp->num_nonaggrs; i++) count[i] = meta[i*3];
+        }
+
+        /* Sort the offset-length pairs when necessary. */
+        if (do_sort) {
+            /* Sort offsets and lengths, based on offsets, into an increasing
+             * order.
+             */
+            if (indv_sorted) {
+                /* heap_merge() runs much faster than qsort() when individual
+                 * lists have already been sorted. However, it has a much
+                 * bigger memory footprint.
+                 */
+                heap_merge(ncp->num_nonaggrs, count, npairs, off_ptr, len_ptr,
+                           NULL);
+                NCI_Free(count);
+            }
+            else
+                /* When some individual offsets are not sorted, we cannot use
+                 * heap_merge(). Note qsort() is an in-place sorting.
+                 */
+                qsort_off_len_buf(npairs, off_ptr, len_ptr, NULL);
+        }
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_get[2] = MAX(ncp->maxmem_get[2], mem_max);
+        ncp->ina_npairs_get = MAX(ncp->ina_npairs_get, npairs);
 #endif
-    if (ncp->rank != ncp->my_aggr) /* non-aggregator writes nothing */
-        buf_count = 0;
+        /* Coalesce the offset-length pairs and calculate the total read amount
+         * and send amount by this aggregator.
+         */
+        overlap = 0;
+        send_amnt = rd_amnt = len_ptr[0];
+        for (i=0, j=1; j<npairs; j++) {
+            send_amnt += len_ptr[j];
+            MPI_Offset gap = off_ptr[i] + len_ptr[i] - off_ptr[j];
+            if (gap >= 0) { /* overlap detected, merge j into i */
+                /* when gap > 0, pairs i and j overlap
+                 * when gap == 0, pairs i and j are contiguous
+                 */
+                MPI_Offset i_end, j_end;
+
+                if (gap > 0) overlap = 1;
+
+                i_end = off_ptr[i] + len_ptr[i];
+                j_end = off_ptr[j] + len_ptr[j];
+                if (i_end < j_end) {
+                    len_ptr[i] += j_end - i_end;
+                    rd_amnt += j_end - i_end;
+                }
+                /* else: j is entirely covered by i */
+            }
+            else { /* j and i do not overlap */
+                rd_amnt += len_ptr[j];
+                i++;
+                if (i < j) {
+                    off_ptr[i] = off_ptr[j];
+                    len_ptr[i] = len_ptr[j];
+                }
+            }
+        }
+
+        /* update npairs after coalescing */
+        npairs = i+1;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_get[3] = MAX(ncp->maxmem_get[3], mem_max);
+#endif
+    } /* if (npairs > 0) */
+    /* else case: This aggregation group may not have data to read, but it must
+     * still participate in the collective MPI-IO calls.
+     */
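The distinction between send_amnt and rd_amnt above is simply "sum of all request lengths" versus "length of the union of the intervals". A minimal standalone illustration over pairs already sorted by offset (simplified types; not the patch's code):

    #include <stddef.h>

    /* off[]/len[] sorted by off[]; returns the union length and stores the
     * plain sum of all lengths in *tot */
    static long long union_len(const long long *off, const long long *len,
                               size_t n, long long *tot)
    {
        long long u = 0, end = -1;   /* end of the union covered so far */
        size_t j;
        *tot = 0;
        for (j = 0; j < n; j++) {
            long long e = off[j] + len[j];
            *tot += len[j];
            if (off[j] > end)  u += len[j];     /* disjoint interval   */
            else if (e > end)  u += e - end;    /* partial overlap     */
            if (e > end) end = e;
        }
        return u;
    }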
-    fh = ncp->collective_fh;
-    /* set the MPI-IO fileview, this is a collective call */
-    err = ncmpio_file_set_view(ncp, fh, &offset, fileType);
-    if (fileType != MPI_BYTE) MPI_Type_free(&fileType);
+    /* set the fileview */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr);
     if (err != NC_NOERR) {
         if (status == NC_NOERR) status = err;
-        buf_count = 0;
+        rd_amnt = 0;
+    }
+
+    /* Allocate the read buffer and send buffer. Once data are read from file
+     * into rd_buf, it is unpacked into send_buf for each non-aggregator.
+     * send_buf will be directly used to send the read request data to
+     * non-aggregators.
+     *
+     * Note rd_amnt may not be the same as send_amnt, as there can be overlaps
+     * between adjacent offset-length pairs after they are sorted.
+     *
+     * If the file offset-length pairs have not been re-ordered, i.e. sorted
+     * and overlaps removed, and this aggregator will not send any read data to
+     * its non-aggregators, then we can use the user's buffer, buf, to call
+     * MPI-IO/PNCIO to read from the file, without allocating an additional
+     * temporary buffer.
+     */
+    if (!do_sort && buf_view.size == send_amnt && !overlap) {
+        rd_buf_view = buf_view;
+        rd_buf = buf;
     }
+    else {
+        /* Read data will be stored in a contiguous read buffer. */
+        rd_buf_view.size = rd_amnt;
+        rd_buf_view.type = MPI_BYTE;
+        rd_buf_view.is_contig = 1;
+        if (rd_amnt > 0)
+            rd_buf = (char*) NCI_Malloc(rd_amnt);
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[4] = MAX(ncp->maxmem_get[4], mem_max);
+    endT = MPI_Wtime();
+    ncp->ina_time_get[0] += endT - startT;
+#endif
-    /* call MPI_File_write_at_all */
-    err = ncmpio_read_write(ncp, NC_REQ_WR, NC_REQ_COLL, offset, buf_count,
-                            MPI_BYTE, wr_buf, 1);
+    err = ncmpio_read_write(ncp, NC_REQ_RD, 0, rd_buf_view, rd_buf);
     if (status == NC_NOERR) status = err;
-    if (wr_buf != NULL) NCI_Free(wr_buf);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[5] = MAX(ncp->maxmem_get[5], mem_max);
+    startT = MPI_Wtime();
+#endif
+
+    /* If sorting has been performed, the orders of off_ptr[] and len_ptr[] may
+     * no longer be the same as the original ones. We must use binary search to
+     * find the aggregated offset-length pair containing each non-aggregator's
+     * offset-length pair to construct a send buffer datatype, a view layout of
+     * the read buffer, rd_buf, so the data can be directly sent from rd_buf.
+     */
+    if (rd_buf != buf) {
+        /* First, aggregators copy the read data to their own user buffer.
+         * Note off_ptr[] is sorted in an incremental order.
+         *
+         * When the offset-length pairs of the read buffer have been sorted or
+         * the read buffer size is smaller than the total get amount, we must
+         * search and copy from the read buffer to self's user buffer.
+         */
+        char *ptr=NULL, *tmp_buf=NULL;
+        size_t m=0, k, scan_off=0;
+
+        /* If this aggregator's user buftype is contiguous, then reuse its
+         * read buffer. If not, allocate a temporary buffer, copy the read
+         * data over, and then unpack it into the user buffer.
+         */
+        if (buf_view.is_contig)
+            ptr = buf;
+        else if (bufLen > 0)
+            ptr = tmp_buf = (char*) NCI_Malloc(bufLen);
+
+        for (j=0; j<num_pairs; j++) {
+            /* locate the coalesced pair containing orig_off_ptr[j] */
+            k = bin_search(orig_off_ptr[j], off_ptr, (size_t)npairs);
+            /* scan_off is the byte offset of off_ptr[m] into rd_buf */
+            for (; m<k; m++) scan_off += len_ptr[m];
+            memcpy(ptr, rd_buf + scan_off + (orig_off_ptr[j] - off_ptr[k]),
+                   orig_len_ptr[j]);
+            ptr += orig_len_ptr[j];
+        }
+
+        if (bufLen > 0 && !buf_view.is_contig) {
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count pos=0;
+            MPI_Unpack_c(tmp_buf, bufLen, &pos, buf, 1, buf_view.type,
+                         MPI_COMM_SELF);
+#else
+            int pos=0;
+            MPI_Unpack(tmp_buf, bufLen, &pos, buf, 1, buf_view.type,
+                       MPI_COMM_SELF);
+#endif
+            NCI_Free(tmp_buf);
+        }
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[1] += endT - startT;
+    startT = endT;
+#endif
+
+    if (ncp->num_nonaggrs == 1)
+        /* In this case, communication is not necessary. */
+        goto fn_exit;
+
+    /* Aggregators start sending read data to non-aggregators. At first,
+     * allocate array_of_blocklengths[] and array_of_displacements[]
+     */
+    for (max_npairs=0, i=1; i<ncp->num_nonaggrs; i++)
+        max_npairs = MAX(meta[3*i], max_npairs);
+
+#ifdef HAVE_MPI_LARGE_COUNT
+    blks  = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs);
+    disps = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs);
+#else
+    blks  = (int*)      NCI_Malloc(sizeof(int) * max_npairs);
+    disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * max_npairs);
+#endif
+
+    /* Now, send data to each non-aggregator */
+    req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+    nreqs = 0;
+    off_start = meta[0];
+    for (i=1; i<ncp->num_nonaggrs; i++) {
+        /* populate disps[] and blks[] */
+        MPI_Aint remote_num_pairs = meta[3*i];
+        MPI_Aint remote_is_incr = meta[3*i+2];
+
+        if (remote_num_pairs == 0) continue; /* zero-sized request */
+
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Count *off = orig_off_ptr + off_start;
+        MPI_Count *len = orig_len_ptr + off_start;
+#else
+        MPI_Offset *off = orig_off_ptr + off_start;
+        int        *len = orig_len_ptr + off_start;
+#endif
+        size_t k, m = 0;
+        size_t scan_off = 0;
+        for (j=0; j<remote_num_pairs; j++) {
+            MPI_Aint addr;
+            if (!remote_is_incr) { m = 0; scan_off = 0; }
+            /* locate the coalesced pair containing off[j] */
+            k = bin_search(off[j], off_ptr, (size_t)npairs);
+            for (; m<k; m++) scan_off += len_ptr[m];
+            /* absolute address into rd_buf, sent below with MPI_BOTTOM */
+            MPI_Get_address(rd_buf + scan_off + (off[j] - off_ptr[k]), &addr);
+            disps[j] = addr;
+            blks[j]  = len[j];
+        }
+        off_start += remote_num_pairs;
+
+        /* construct a datatype describing this non-aggregator's view of
+         * rd_buf and send the read data in one message
+         */
+        MPI_Datatype sendType;
+#ifdef HAVE_MPI_LARGE_COUNT
+        mpireturn = MPI_Type_create_hindexed_c(remote_num_pairs, blks, disps,
+                                               MPI_BYTE, &sendType);
+#else
+        mpireturn = MPI_Type_create_hindexed(remote_num_pairs, blks, disps,
+                                             MPI_BYTE, &sendType);
+#endif
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
+            /* return the first encountered error if there is any */
+            if (status == NC_NOERR) status = err;
+        }
+        else {
+            MPI_Type_commit(&sendType);
+#ifdef HAVE_MPI_LARGE_COUNT
+            TRACE_COMM(MPI_Isend_c)(MPI_BOTTOM, 1, sendType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#else
+            TRACE_COMM(MPI_Isend)(MPI_BOTTOM, 1, sendType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#endif
+            MPI_Type_free(&sendType);
+        }
+    }
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[2] += endT - startT;
+    startT = endT;
+#endif
+
+    if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+        TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+        MPI_Status *statuses = (MPI_Status *)
+                               NCI_Malloc(nreqs * sizeof(MPI_Status));
+        TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+        NCI_Free(statuses);
+#endif
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+            /* return the first encountered error if there is any */
+            if (status == NC_NOERR) status = err;
+        }
+    }
+    NCI_Free(blks);
+    NCI_Free(disps);
+
+fn_exit:
+    /* offsets[] and lengths[] are used in PNCIO read subroutines as the
+     * flattened filetype. They cannot be freed before the I/O is done.
+     */
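heap_merge() exploits the fact that each member's pairs arrive already sorted: merging k sorted runs is near-linear, instead of the O(n log n) a general qsort costs. The two-run case shows the idea (an illustration; the patch's heap_merge also carries the lengths and buffer addresses along with the offsets):

    #include <stddef.h>

    /* merge sorted a[na] and b[nb] into out[na+nb] */
    static void merge2(const long long *a, size_t na,
                       const long long *b, size_t nb, long long *out)
    {
        size_t i = 0, j = 0, k = 0;
        while (i < na && j < nb)
            out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
        while (i < na) out[k++] = a[i++];
        while (j < nb) out[k++] = b[j++];
    }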
+    if (rd_buf != NULL && rd_buf != buf) NCI_Free(rd_buf);
+    if (orig_lengths != NULL) NCI_Free(orig_lengths);
+    if (orig_offsets != NULL) NCI_Free(orig_offsets);
+    if (req != NULL) NCI_Free(req);
+    if (meta != NULL) NCI_Free(meta);
+
+    /* Must free offsets and lengths now, as they may be realloc-ed in
+     * ina_collect_md()
+     */
+    if (offsets != NULL) NCI_Free(offsets);
+    if (lengths != NULL) NCI_Free(lengths);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[3] += endT - startT;
+#endif
     return status;
 }
-/*----< ncmpio_intra_node_aggregation_nreqs() >------------------------------*/
-/* This is a collective call */
+/*----< req_compare() >------------------------------------------------------*/
+/* used to sort reqs[] based on their starting file offsets */
+static int
+req_compare(const void *a, const void *b)
+{
+    if (((NC_req*)a)->offset_start > ((NC_req*)b)->offset_start) return  (1);
+    if (((NC_req*)a)->offset_start < ((NC_req*)b)->offset_start) return (-1);
+    return (0);
+}
+
+/*----< ncmpio_ina_nreqs() >-------------------------------------------------*/
+/* This subroutine handles PnetCDF's requests made from non-blocking APIs,
+ * which contain multiple requests to one or more variables. The input
+ * arguments are described below.
+ * reqMode:    NC_REQ_RD for read requests and NC_REQ_WR for write.
+ * num_reqs:   number of elements in array req_list.
+ * req_list[]: stores pending requests from non-blocking API calls, which is
+ *             used to construct file offset-length pairs and the user buffer
+ *             datatype.
+ * newnumrecs: number of new records
+ */
 int
-ncmpio_intra_node_aggregation_nreqs(NC         *ncp,
-                                    int         reqMode,
-                                    int         num_reqs,
-                                    NC_req     *put_list,
-                                    MPI_Offset  newnumrecs)
+ncmpio_ina_nreqs(NC         *ncp,
+                 int         reqMode,
+                 int         num_reqs,
+                 NC_req     *req_list,
+                 MPI_Offset  newnumrecs)
 {
-    int err, status=NC_NOERR;
-    MPI_Aint bufLen, num_pairs;
+    int err, status=NC_NOERR, is_incr=1;
+    void *buf=NULL;
+    MPI_Aint num_pairs;
 #ifdef HAVE_MPI_LARGE_COUNT
     MPI_Count *offsets=NULL, *lengths=NULL;
 #else
-    MPI_Aint *offsets=NULL;
+    MPI_Offset *offsets=NULL;
     int *lengths=NULL;
 #endif
-    MPI_Datatype bufType=MPI_BYTE;
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
     double timing = MPI_Wtime();
 #endif
-    /* currently supports write requests only */
-    if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR;
+// printf("%s at %d: rank=%d num_aggrs_per_node=%d my_aggr=%d num_nonaggrs=%d\n",__func__,__LINE__, ncp->rank, ncp->num_aggrs_per_node, ncp->my_aggr, ncp->num_nonaggrs);
+
+    /* populate reqs[].offset_start, the starting offset of each request */
+    NC_req *reqs = req_list;
+    int i, decreasing=0;
+    for (i=0; i<num_reqs; i++) {
+        NC_var *varp;
+        NC_lead_req *lead;
+
+        lead = (fIsSet(reqMode, NC_REQ_RD)) ? ncp->get_lead_list
+                                            : ncp->put_lead_list;
+        lead += reqs[i].lead_off;
+        varp = lead->varp;
+
+        if (varp->ndims == 0) { /* scalar variable */
+            reqs[i].offset_start += varp->begin;
+        }
+        else if (reqs[i].npairs == 1) { /* only one offset-length pair */
+            MPI_Offset off = varp->begin;
+
+            if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize;
-    assert(ncp->my_aggr >= 0);
+// printf("%s at %d: num_reqs=%d reqs[%d].npairs == 1 offset_start=%lld off=%lld\n", __func__,__LINE__,num_reqs,i,reqs[i].offset_start,off);
+            reqs[i].offset_start += off;
+        }
+        else {
+            /* start/count/stride have been allocated in a contiguous array */
+            MPI_Offset *count, *stride, offset_end;
+            count = reqs[i].start + varp->ndims;
+            stride = (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) ? NULL :
+                     count + varp->ndims;
+
+            /* calculate the access range of this request */
+            ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride,
+                                  &reqs[i].offset_start, &offset_end);
+        }
+        /* check if offset_start values are in a monotonic nondecreasing order */
+        if (i > 0 && reqs[i].offset_start < reqs[i-1].offset_start)
+            decreasing = 1;
+    }
+
+    /* If a decreasing order is found, sort reqs[] based on reqs[].offset_start
+     * into an increasing order.
+     */
+    if (decreasing)
+        qsort(reqs, (size_t)num_reqs, sizeof(NC_req), req_compare);
+
+// printf("%s at %d: decreasing=%d\n",__func__,__LINE__, decreasing);
     /* construct file offset-length pairs
      *     num_pairs: total number of off-len pairs
      *     offsets:   array of flattened offsets
      *     lengths:   array of flattened lengths
+     *     is_incr:   whether offsets are incremental
      */
     if (num_reqs > 0)
-        flatten_reqs(ncp, num_reqs, put_list, &num_pairs, &offsets, &lengths);
+        flatten_reqs(ncp, reqMode, num_reqs, reqs, &is_incr, &num_pairs,
+                     &offsets, &lengths);
     else
         num_pairs = 0;
-    /* construct write buffer datatype, bufType.
-     * bufLen is the buffer size in bytes
+#if 0
+if (0 && num_pairs==10) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs,
+offsets[0],offsets[1],offsets[2],offsets[3],offsets[4],offsets[5],
+offsets[6],offsets[7],offsets[8],offsets[9],
+lengths[0],lengths[1],lengths[2],lengths[3],lengths[4],lengths[5],
+lengths[6],lengths[7],lengths[8],lengths[9]);
+
+else if (num_pairs==12) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs,
+offsets[0],offsets[1],offsets[2],offsets[3],offsets[4],
+offsets[5],offsets[6],offsets[7],offsets[8],offsets[9],
+offsets[10],offsets[11],
+lengths[0],lengths[1],lengths[2],lengths[3],lengths[4],
+lengths[5],lengths[6],lengths[7],lengths[8],lengths[9],
+lengths[10],lengths[11]);
+else if (num_pairs) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld len=%lld\n",__func__,__LINE__, num_reqs, num_pairs,offsets[0],lengths[0]);
+#endif
+
+    /* Populate buf_view, which contains metadata of the user buffers in the
+     * nonblocking requests. If the buffer is non-contiguous, buf is set to
+     * NULL and buf_view.type will be a derived datatype constructed using
+     * MPI_BOTTOM.
      */
-    if (num_reqs > 0) {
-        construct_buf_type(ncp, num_reqs, put_list, &bufLen, &bufType);
-        bufLen = 1;
-    }
-    else
-        bufLen = 0;
+    PNCIO_View buf_view;
+    err = flat_buf_type(ncp, reqMode, num_reqs, reqs, &buf_view, &buf);
+    if (status == NC_NOERR) status = err;
+if (num_reqs > 0) assert(buf != NULL);
+
+#if 0
+if (buf_view.count > 1) printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.off[1], buf_view.len[0],buf_view.len[1]);
+else if (buf_view.count) printf("%s at %d: buf_view count=%lld off=%lld len=%lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.len[0]);
+
+{int *wkl;
+int nelems, j,k, xsz=4;
+char *xbuf, msg[1024],str[64];
+printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size);
+ wkl = (int*) malloc(buf_view.size);
+ nelems=buf_view.size/xsz;
+ xbuf = buf;
+ memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz);
+ sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf);
+ for (k=0; k<nelems; k++) { sprintf(str,"%d ", wkl[k]); strcat(msg, str); }
+ printf("%s\n", msg);
+ free(wkl);}
+#endif
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_flatten += MPI_Wtime() - timing;
 #endif
-    err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufLen,
-                                 bufType, NULL);
+    int saved_my_aggr, saved_num_nonaggrs;
+    saved_my_aggr = ncp->my_aggr;
+    saved_num_nonaggrs = ncp->num_nonaggrs;
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if
+         * self rank is an INA aggregator and the INA group size is 1.
+         */
+        ncp->my_aggr = ncp->rank;
+        ncp->num_nonaggrs = 1;
+    }
+
+// printf("%s at %d: is_incr=%d buf=%p\n",__func__,__LINE__, is_incr,buf);
+    /* perform intra-node aggregation */
+    if (fIsSet(reqMode, NC_REQ_WR))
+        err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+    else
+        err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
     if (status == NC_NOERR) status = err;
-    /* free and reset bufType */
-    if (bufType != MPI_BYTE && bufType != MPI_DATATYPE_NULL)
-        MPI_Type_free(&bufType);
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* restore ncp->my_aggr and ncp->num_nonaggrs */
+        ncp->my_aggr = saved_my_aggr;
+        ncp->num_nonaggrs = saved_num_nonaggrs;
+    }
+
+#if 0
+if (fIsSet(reqMode, NC_REQ_RD))
+{int *wkl;
+int nelems, j,k, xsz=4;
+char *xbuf, msg[1024],str[64];
+printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size);
+ wkl = (int*) malloc(buf_view.size);
+ nelems=buf_view.size/xsz;
+ xbuf = buf;
+ memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz);
+ sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf);
+ for (k=0; k<nelems; k++) { sprintf(str,"%d ", wkl[k]); strcat(msg, str); }
+ printf("%s\n", msg);
+ free(wkl);}
+#endif
+
+    return status;
+}
-/*----< ncmpio_intra_node_aggregation() >------------------------------------*/
-/* This is a collective call */
+ * start[]: starting offsets + * count[]: counts along each dimension + * stride[]: stride along each dimension + * buf_len: size of I/O buffer in bytes + * buf: pointer to the user buffer + */ int -ncmpio_intra_node_aggregation(NC *ncp, - int reqMode, - NC_var *varp, - const MPI_Offset *start, - const MPI_Offset *count, - const MPI_Offset *stride, - MPI_Offset bufCount, - MPI_Datatype bufType, - void *buf) +ncmpio_ina_req(NC *ncp, + int reqMode, + NC_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + MPI_Offset buf_len, + void *buf) { - int err, status=NC_NOERR; + int err, status=NC_NOERR, is_incr=1; MPI_Aint num_pairs; + PNCIO_View buf_view; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *offsets=NULL, *lengths=NULL; #else - MPI_Aint *offsets=NULL; + MPI_Offset *offsets=NULL; int *lengths=NULL; #endif #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) double timing = MPI_Wtime(); #endif - /* currently supports write requests only */ - if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR; - - if (buf == NULL) /* zero-length request */ - return intra_node_aggregation(ncp, 0, NULL, NULL, 0, MPI_BYTE, NULL); - - /* construct file offset-length pairs - * num_pairs: total number of off-len pairs - * offsets: array of flattened offsets - * lengths: array of flattened lengths - */ - err = flatten_req(ncp, varp, start, count, stride, &num_pairs, &offsets, - &lengths); - if (err != NC_NOERR) { + /* blocking API's buffer passed here is always contiguous */ + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; + buf_view.size = buf_len; + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + +// printf("%s at %d: buf=%s\n",__func__,__LINE__, (buf==NULL)?"NULL":"NOT NULL"); + if (buf_len == 0 || buf == NULL) { + /* This is a zero-length request. When in collective data mode, this + * rank must still participate collective calls. When INA is enabled, + * this rank tells its aggregator that it has no I/O data. When INA is + * disabled, this rank must participate other collective file call. 
+         */
         num_pairs = 0;
-        if (offsets != NULL)
-            NCI_Free(offsets);
-        offsets = NULL;
+        buf_view.size  = 0;
+        buf_view.count = 0;
     }
-    status = err;
+    else {
+        /* construct file access offset-length pairs
+         * num_pairs: total number of off-len pairs
+         * offsets:   array of flattened offsets
+         * lengths:   array of flattened lengths
+         * is_incr:   whether offsets are incremental
+         */
+        err = flatten_req(ncp, varp, start, count, stride, &is_incr,
+                          &num_pairs, &offsets, &lengths);
+        if (err != NC_NOERR) { /* make this rank's request zero-sized */
+            is_incr = 1;
+            num_pairs = 0;
+            buf_len = 0;
+            buf_view.size  = 0;
+            buf_view.count = 0;
+            if (offsets != NULL) NCI_Free(offsets);
+            if (lengths != NULL) NCI_Free(lengths);
+            offsets = NULL;
+            lengths = NULL;
+        }
+        status = err;
+    }

 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+    if (ncp->rank == ncp->my_aggr)
+        ncp->ina_time_flatten += MPI_Wtime() - timing;
 #endif

-    err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufCount,
-                                 bufType, buf);
-    if (status == NC_NOERR) status = err;
+    int saved_my_aggr, saved_num_nonaggrs;
+    saved_my_aggr = ncp->my_aggr;
+    saved_num_nonaggrs = ncp->num_nonaggrs;
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if
+         * self rank is an INA aggregator and the INA group size is 1.
+         */
+        ncp->my_aggr = ncp->rank;
+        ncp->num_nonaggrs = 1;
+    }
+
+    /* perform intra-node aggregation */
+    if (fIsSet(reqMode, NC_REQ_WR)) {
+        err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+        if (status == NC_NOERR) status = err;
+    }
+    else {
+        err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+        if (status == NC_NOERR) status = err;
+    }
+
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* restore ncp->my_aggr and ncp->num_nonaggrs */
+        ncp->my_aggr = saved_my_aggr;
+        ncp->num_nonaggrs = saved_num_nonaggrs;
+    }

     return status;
 }
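ncmpio_ina_req() is reached from the user-level blocking APIs. For context, a minimal, hedged user-side sketch of such a call path; the file name, dimension layout, and sizes below are made up for illustration:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: a blocking collective write with one variable and a
 * contiguous user buffer, i.e. the single-request case handled above. */
int write_block(MPI_Comm comm, int rank)
{
    int ncid, dimid, varid, err;
    MPI_Offset start[1], count[1];
    float buf[10];

    for (int i = 0; i < 10; i++) buf[i] = (float)rank;

    err = ncmpi_create(comm, "testfile.nc", NC_CLOBBER, MPI_INFO_NULL, &ncid);
    if (err != NC_NOERR) return err;
    ncmpi_def_dim(ncid, "x", NC_UNLIMITED, &dimid);  /* record dimension */
    ncmpi_def_var(ncid, "var", NC_FLOAT, 1, &dimid, &varid);
    ncmpi_enddef(ncid);

    start[0] = rank * 10;  /* each rank writes its own contiguous block */
    count[0] = 10;
    err = ncmpi_put_vara_float_all(ncid, varid, start, count, buf);

    ncmpi_close(ncid);
    return err;
}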
diff --git a/src/drivers/ncmpio/ncmpio_open.c b/src/drivers/ncmpio/ncmpio_open.c
index a24726ee9..7d5ec6d23 100644
--- a/src/drivers/ncmpio/ncmpio_open.c
+++ b/src/drivers/ncmpio/ncmpio_open.c
@@ -39,92 +39,91 @@ ncmpio_open(MPI_Comm comm,
             MPI_Info    user_info, /* user's and env info combined */
             void      **ncpp)
 {
-    char *env_str, *mpi_name;
-    int i, mpiomode, err, status=NC_NOERR, mpireturn;
-    MPI_File fh;
-    MPI_Info info_used;
+    char *filename, *env_str, value[MPI_MAX_INFO_VAL + 1], *mpi_name;
+    int i, rank, nprocs, mpiomode, err, status=NC_NOERR, mpireturn, flag;
+    MPI_File fh=MPI_FILE_NULL;
     NC *ncp=NULL;

     *ncpp = NULL;

+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nprocs);
+
     /* Note path's validity and omode consistency have been checked in
-     * ncmpi_open() in src/dispatchers/file.c and
-     * path consistency will be done in MPI_File_open */
+     * ncmpi_open() in src/dispatchers/file.c and path consistency will be done
+     * in MPI_File_open.
+     */

     /* First, check whether omode is valid or supported ---------------------*/
+    /* NC_DISKLESS is not supported yet */
     if (omode & NC_DISKLESS) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE)

     /* NC_MMAP is not supported yet */
     if (omode & NC_MMAP) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE)

-#if 0 && defined(HAVE_ACCESS)
-    if (mpiomode == MPI_MODE_RDONLY) { /* file should already exit */
-        int rank, file_exist;
-        MPI_Comm_rank(comm, &rank);
-        if (rank == 0) {
-            if (access(path, F_OK) == 0) file_exist = 1;
-            else file_exist = 0;
-        }
-        TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm);
-        if (!file_exist) DEBUG_RETURN_ERROR(NC_ENOENT)
-    }
-#endif
+    /* allocate buffer for header object NC and initialize its contents */
+    ncp = (NC*) NCI_Calloc(1, sizeof(NC));
+    if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)

-    /* open file collectively ---------------------------------------------- */
-    mpiomode = fIsSet(omode, NC_WRITE) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
+    *ncpp = (void*)ncp;

-    TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    ncp->ncid   = ncid;
+    ncp->comm   = comm; /* reuse comm duplicated in dispatch layer */
+    ncp->rank   = rank;
+    ncp->nprocs = nprocs;

-    /* get the file info used/modified by MPI-IO */
-    TRACE_IO(MPI_File_get_info, (fh, &info_used));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    /* Extract hints from user_info. Two hints must be extracted now in order
+     * to continue:
+     * nc_pncio: whether to use MPI-IO or PnetCDF's PNCIO driver.
+     * nc_num_aggrs_per_node: number of processes per node to be INA
+     * aggregators.
+     *
+     * ncp->fstype will be set in ncmpio_hint_extract().
+     */
+    ncmpio_hint_extract(ncp, user_info);

-    /* Now the file has been successfully opened, allocate/set NC object */
+    if (ncp->fstype == PNCIO_FSTYPE_CHECK)
+        /* Check file system type. If the given file does not exist, check its
+         * folder. Currently PnetCDF's PNCIO drivers support Lustre
+         * (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS).
+         */
+        ncp->fstype = PNCIO_FileSysType(path);

-    /* path's validity and omode consistency have been checked in ncmpi_open()
-     * in src/dispatchers/file.c */
-
-    /* allocate buffer for header object NC */
-    ncp = (NC*) NCI_Calloc(1, sizeof(NC));
-    if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)
+    /* Remove the file system type prefix name if there is any. For example,
+     * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make
+     * filename point to "/home/foo/testfile.nc", so it can be used in the
+     * POSIX access() call below.
+     */
+    filename = ncmpii_remove_file_system_type_prefix(path);
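Both hints consumed by ncmpio_hint_extract() are ordinary MPI_Info key-value pairs set by the application before open/create. A hedged user-side sketch; the hint names come from this patch, while the values are illustrative only:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: ask PnetCDF to use its PNCIO driver and two
 * intra-node aggregators per compute node. */
int open_with_hints(MPI_Comm comm, const char *path, int *ncidp)
{
    int err;
    MPI_Info info;

    MPI_Info_create(&info);
    MPI_Info_set(info, "nc_pncio", "enable");          /* use PNCIO driver */
    MPI_Info_set(info, "nc_num_aggrs_per_node", "2");  /* enable INA */

    err = ncmpi_open(comm, path, NC_NOWRITE, info, ncidp);
    MPI_Info_free(&info);
    return err;
}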
-    /* PnetCDF default fill mode is no fill */
-    fClr(ncp->flags, NC_MODE_FILL);
-    if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY);
+    ncp->path     = path; /* reuse path duplicated in dispatch layer */
+    ncp->pncio_fh = NULL;
+    ncp->iomode   = omode;

-    ncp->ncid = ncid;
+    ncp->collective_fh  = MPI_FILE_NULL;
+    ncp->independent_fh = MPI_FILE_NULL;

-    /* chunk size for reading header (set default before check hints) */
-    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
+    /* Set the file open mode in mpiomode, which may later be needed in
+     * ncmpi_begin_indep_data() to open the file for independent data mode.
+     */
+    mpiomode = fIsSet(omode, NC_WRITE) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
+    ncp->mpiomode = mpiomode;

-    /* buffer to pack noncontiguous user buffers when calling wait() */
-    ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE;
+    /* PnetCDF default fill mode is no fill */
+    fClr(ncp->flags, NC_MODE_FILL);

-    /* Extract PnetCDF specific I/O hints from user_info and set default hint
-     * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and
-     * priors fail to preserve user hints that are not recogniozed by the MPI
-     * libraries.
-     */
-    ncmpio_set_pnetcdf_hints(ncp, user_info, info_used);
-
-    ncp->iomode = omode;
-    ncp->comm = comm; /* reuse comm duplicated in dispatch layer */
-    MPI_Comm_rank(comm, &ncp->rank);
-    MPI_Comm_size(comm, &ncp->nprocs);
-    ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */
-    ncp->mpiomode = mpiomode;
-    ncp->collective_fh = fh;
-    ncp->independent_fh = (ncp->nprocs > 1) ? MPI_FILE_NULL : fh;
-    ncp->path = (char*) NCI_Malloc(strlen(path) + 1);
-    strcpy(ncp->path, path);
+    /* set read-only mode */
+    if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY);

 #ifdef PNETCDF_DEBUG
     /* PNETCDF_DEBUG is set at configure time, which will be overwritten by
-     * the run-time environment variable PNETCDF_SAFE_MODE */
+     * the run-time environment variable PNETCDF_SAFE_MODE.
+     */
     ncp->safe_mode = 1;
 #endif

     /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform
@@ -133,17 +132,174 @@ ncmpio_open(MPI_Comm comm,
     if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) {
         if (*env_str == '0') ncp->safe_mode = 0;
         else                 ncp->safe_mode = 1;
-        /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can
-         * be '\0' (null character). In this case, safe_mode is enabled */
+        /* If PNETCDF_SAFE_MODE is set but without a value, *env_str can
+         * be '\0' (null character). In this case, safe_mode is enabled.
+         */
+    }
+
+    /* Construct a list of unique IDs of compute nodes allocated to this job
+     * and save it in ncp->node_ids[nprocs], which contains node IDs of each
+     * rank. The node IDs are used either when intra-node aggregation is
+     * enabled or when using PnetCDF's PNCIO driver.
+     *
+     * When intra-node aggregation is enabled, node IDs are used to create a
+     * new MPI communicator consisting of the intra-node aggregators only. The
+     * communicator will be used to call file open in MPI-IO or PnetCDF's PNCIO
+     * driver. This means only intra-node aggregators will perform file I/O in
+     * PnetCDF collective put and get operations.
+     */
+    ncp->node_ids = NULL;
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node != 0) {
+        err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* When the total number of aggregators >= number of processes, disable
+         * intra-node aggregation.
+         */
+        if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs)
+            ncp->num_aggrs_per_node = 0;
+    }
+
+    /* ncp->num_aggrs_per_node = 0, or > 0 indicates whether this feature
+     * is disabled or enabled globally for all processes.
+     */
+    ncp->my_aggr        = -1;
+    ncp->ina_comm       = MPI_COMM_NULL;
+    ncp->ina_nprocs     = 0;
+    ncp->ina_rank       = -1;
+    ncp->ina_node_list  = NULL;
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Divide all ranks into groups. Each group is assigned one intra-node
+         * aggregator. The following metadata related to intra-node
+         * aggregation will be set up.
+         * ncp->my_aggr is the aggregator's rank ID of this group. When ==
+         * ncp->rank, this rank is an aggregator.
+         * ncp->num_nonaggrs is the number of non-aggregators assigned to this
+         * rank (an aggregator).
+         * ncp->ina_comm will be created consisting of only intra-node
+         * aggregators, which will be used when calling MPI_File_open().
+         * For non-aggregators, ncp->ina_comm == MPI_COMM_NULL.
+         * ncp->node_ids[] will be modified to contain the node IDs of
+         * intra-node aggregators only, which will be passed to pncio_fh.
+         */
+        err = ncmpio_ina_init(ncp);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* As non-aggregators will not perform any file I/O, we can now replace
+         * comm with ina_comm. Same for nprocs.
+         */
+        comm = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+
+        /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of
+         * this subroutine is to open the file and obtain the file handle,
+         * non-aggregators can skip it.
+         */
+        if (comm == MPI_COMM_NULL) {
+            MPI_Info_create(&ncp->mpiinfo);
+            goto fn_exit;
+        }
+    }
+
+    /* open file collectively ---------------------------------------------- */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO) {
+        TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+
+        /* Now the file has been successfully opened */
+        ncp->collective_fh  = fh;
+        ncp->independent_fh = (nprocs > 1) ? MPI_FILE_NULL : fh;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+    }
+    else {
+        /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes   = ncp->num_nodes;
+        ncp->pncio_fh->node_ids    = ncp->node_ids;
+
+        err = PNCIO_File_open(comm, filename, mpiomode, user_info,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* Now the file has been successfully opened, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+    }
+
+    /* add PnetCDF hints into ncp->mpiinfo */
+    ncmpio_hint_set(ncp, ncp->mpiinfo);
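To make the grouping described above concrete, here is a hedged, self-contained sketch of how one aggregator per node could be chosen from the node-ID list. This is not the actual ncmpio_ina_init(), which supports multiple aggregators per node; the helper below assumes one aggregator per node:

/* Illustrative only: pick the lowest rank on each node as its aggregator.
 * node_ids[r] is the node ID of rank r, as built by the node-list step. */
static int pick_my_aggr(int rank, int nprocs, const int *node_ids)
{
    int r, my_aggr = rank;
    for (r = 0; r < nprocs; r++) {
        if (node_ids[r] == node_ids[rank]) { /* first rank on my node */
            my_aggr = r;
            break;
        }
    }
    return my_aggr;
}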
+
+fn_exit:
+    if (ncp->num_aggrs_per_node > 0) {
+        /* When intra-node aggregation is enabled, it is necessary to make sure
+         * non-aggregators obtain consistent values of the file striping hints,
+         * as non-aggregators do not have hints returned from
+         * MPI_File_get_info().
+         */
+        int striping_info[2];
+        if (ncp->rank == 0) {
+            MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[0] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[0] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[0] = 0;
+            }
+
+            MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[1] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[1] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[1] = 0;
+            }
+        }
+
+        MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm);
+
+        if (ncp->my_aggr != ncp->rank) {
+            sprintf(value, "%d", striping_info[0]);
+            MPI_Info_set(ncp->mpiinfo, "striping_unit", value);
+            sprintf(value, "%d", striping_info[1]);
+            MPI_Info_set(ncp->mpiinfo, "striping_factor", value);
+        }
+    }
+
+    /* ina_node_list is no longer needed */
+    if (ncp->ina_node_list != NULL) {
+        NCI_Free(ncp->ina_node_list);
+        ncp->ina_node_list = NULL;
+    }
+    /* node_ids is no longer needed */
+    if (ncp->node_ids != NULL) {
+        NCI_Free(ncp->node_ids);
+        ncp->node_ids = NULL;
+    }
+    if (ncp->pncio_fh != NULL)
+        ncp->pncio_fh->node_ids = NULL;

     /* read header from file into NC object pointed by ncp -------------------*/
     err = ncmpio_hdr_get_NC(ncp);
     if (err == NC_ENULLPAD) status = NC_ENULLPAD; /* non-fatal error */
     else if (err != NC_NOERR) { /* fatal error */
-        ncmpio_close_files(ncp, 0);
+        ncmpio_file_close(ncp);
+        if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm);
         ncmpio_free_NC(ncp);
-        return err;
+        DEBUG_RETURN_ERROR(err);
     }

 #ifdef ENABLE_SUBFILING
@@ -152,29 +308,28 @@ ncmpio_open(MPI_Comm comm,
         err = ncmpio_get_att(ncp, NC_GLOBAL, "_PnetCDF_SubFiling.num_subfiles",
                              &ncp->num_subfiles, MPI_INT);
         if (err == NC_NOERR && ncp->num_subfiles > 1) {
-            int i;
             /* ignore error NC_ENOTATT if this attribute is not defined */
             for (i=0; i<ncp->vars.ndefined; i++) {
                 /* variables may have different numbers of subfiles */
                 err = ncmpio_get_att(ncp, i, "_PnetCDF_SubFiling.num_subfiles",
                                      &ncp->vars.value[i]->num_subfiles,MPI_INT);
                 if (err == NC_ENOTATT) continue;
-                if (err != NC_NOERR) return err;
+                if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                 if (ncp->vars.value[i]->num_subfiles > 1) {
                     /* find the orginal ndims of variable i */
                     err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.ndims_org",
                                          &ncp->vars.value[i]->ndims_org,MPI_INT);
-                    if (err != NC_NOERR) return err;
+                    if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                     ncp->vars.value[i]->dimids_org = (int*) NCI_Malloc(
                                   ncp->vars.value[i]->ndims_org * SIZEOF_INT);
                     err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.dimids_org",
                                          ncp->vars.value[i]->dimids_org, MPI_INT);
-                    if (err != NC_NOERR) return err;
+                    if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                 }
             }
             /* open subfile */
             err = ncmpio_subfile_open(ncp);
-            if (err != NC_NOERR) return err;
+            if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
         }
         else ncp->num_subfiles = 0;
     }
@@ -191,21 +346,6 @@ ncmpio_open(MPI_Comm comm,
             ncp->vars.value[i]->attrs.hash_size = ncp->hash_size_attr;
 #endif

-    /* determine whether to enable intra-node aggregation and set up all
-     * intra-node aggregation metadata.
-     * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature
-     * is enabled globally for all processes.
-     * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively
-     * enabled for the aggregation group of this process.
-     */
-    ncp->my_aggr = -1;
-    if (ncp->num_aggrs_per_node != 0) {
-        err = ncmpio_intra_node_aggr_init(ncp);
-        if (err != NC_NOERR) return err;
-    }
-
-    *ncpp = (void*)ncp;
-
     return status;
 }

diff --git a/src/drivers/ncmpio/ncmpio_subfile.c b/src/drivers/ncmpio/ncmpio_subfile.c
index e1be70ec7..3bfb0bef6 100644
--- a/src/drivers/ncmpio/ncmpio_subfile.c
+++ b/src/drivers/ncmpio/ncmpio_subfile.c
@@ -315,7 +315,7 @@ int ncmpio_subfile_partition(NC *ncp)
         if (dpp[vpp[i]->dimids[par_dim_id]]->size/ncp->num_subfiles > 0 &&
             vpp[i]->ndims >= par_dim_id+1 &&
             vpp[i]->ndims >= SUBFILING_MIN_NDIMS) {
-            int varid, j, jj, k;
+            int varid, jj, k;
             int var_ndims = vpp[i]->ndims; /* keep org ndims */
             int dimids[var_ndims];
             char *key[ncp->num_subfiles][var_ndims];
@@ -1003,7 +1003,6 @@ ncmpio_subfile_getput_vars(NC *ncp,
     for (i=0; i<

diff --git a/src/drivers/ncmpio/ncmpio_sync.c b/src/drivers/ncmpio/ncmpio_sync.c
--- a/src/drivers/ncmpio/ncmpio_sync.c
+++ b/src/drivers/ncmpio/ncmpio_sync.c
 #include "ncmpio_NC.h"

-/*----< ncmpio_file_sync() >-------------------------------------------------*/
-/* This function must be called collectively, no matter if it is in collective
- * or independent data mode.
- */
-int
-ncmpio_file_sync(NC *ncp) {
-    char *mpi_name;
-    int mpireturn;
-
-    if (ncp->independent_fh != MPI_FILE_NULL) {
-        TRACE_IO(MPI_File_sync, (ncp->independent_fh));
-        if (mpireturn != MPI_SUCCESS)
-            return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-    }
-    /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */
-    if (ncp->nprocs == 1) return NC_NOERR;
-
-    /* ncp->collective_fh is never MPI_FILE_NULL as collective mode is
-     * default in PnetCDF */
-    TRACE_IO(MPI_File_sync, (ncp->collective_fh));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-
-    /* Barrier is not necessary ...
-    TRACE_COMM(MPI_Barrier)(ncp->comm);
-     */
-
-    return NC_NOERR;
-}
-
 #define NC_NUMRECS_OFFSET 4

 /*----< ncmpio_write_numrecs() >---------------------------------------------*/
-/* root process writes the new record number into file.
+/* Only the root process writes the new record number into the file.
  * This function is called by:
  * 1. ncmpio_sync_numrecs
  * 2. collective nonblocking wait API, if the new number of records is bigger
@@ -69,32 +39,42 @@ int
 ncmpio_write_numrecs(NC *ncp,
                      MPI_Offset new_numrecs)
 {
-    char *mpi_name;
-    int mpireturn, err;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    int err=NC_NOERR;
+    PNCIO_View buf_view;

-    if (!fIsSet(ncp->flags, NC_HCOLL) && ncp->rank > 0)
-        /* Only root process writes numrecs in file */
-        return NC_NOERR;
+    buf_view.type      = MPI_BYTE;
+    buf_view.size      = 0;
+    buf_view.count     = 1;
+    buf_view.is_contig = 1;

-    /* return now if there is no record variabled defined */
+    /* return now if there is no record variable defined */
     if (ncp->vars.num_rec_vars == 0) return NC_NOERR;

-    fh = ncp->independent_fh;
-    if (ncp->nprocs > 1 && !NC_indep(ncp))
-        fh = ncp->collective_fh;
+    /* When intra-node aggregation is enabled, non-aggregators do not
+     * participate in any collective calls below.
+     */
+    if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr)
+        return NC_NOERR;
+
+    /* If not requiring all MPI-IO calls to be collective, non-root processes
+     * can return now. This is because only the root process writes numrecs to
+     * the file header.
+     */
+    if (!fIsSet(ncp->flags, NC_HCOLL) && ncp->rank > 0)
+        return NC_NOERR;

+    /* If collective MPI-IO is required for all MPI-IO calls, then all non-root
+     * processes participate in the collective write call with zero-size
+     * requests.
+     */
     if (ncp->rank > 0 && fIsSet(ncp->flags, NC_HCOLL)) {
-        /* other processes participate the collective call */
-        TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        return (mpireturn == MPI_SUCCESS) ? NC_NOERR :
-               ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        ncmpio_file_write_at_all(ncp, 0, NULL, buf_view);
+        return NC_NOERR;
     }
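For context, numrecs lives at byte offset 4 of the classic-format header, immediately after the 4-byte magic string, which is why NC_NUMRECS_OFFSET is 4. A hedged sketch of the big-endian encoding the root performs (the real code uses the ncmpix_put_* helpers; CDF-1/2 store 4 bytes, CDF-5 stores 8):

/* Illustrative only: big-endian encode numrecs into pos[], most
 * significant byte first, returning the number of bytes written. */
static int encode_numrecs(char *pos, long long numrecs, int is_cdf5)
{
    int i, len = is_cdf5 ? 8 : 4;
    for (i = 0; i < len; i++)
        pos[i] = (char)((unsigned long long)numrecs >> (8 * (len - 1 - i)));
    return len;
}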
     if (new_numrecs > ncp->numrecs || NC_ndirty(ncp)) {
         int len;
         char pos[8], *buf=pos;
+        MPI_Offset wlen;

         /* update ncp->numrecs */
         if (new_numrecs > ncp->numrecs) ncp->numrecs = new_numrecs;

@@ -113,41 +93,32 @@ ncmpio_write_numrecs(NC *ncp,
         }
         /* ncmpix_put_xxx advances the 1st argument with size len */

-        /* explicitly initialize mpistatus object to 0. For zero-length read,
-         * MPI_Get_count may report incorrect result for some MPICH version,
-         * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-         * Thus we initialize it above to work around.
-         */
-        memset(&mpistatus, 0, sizeof(MPI_Status));
-
-        /* root's file view always includes the entire file header */
-        if (fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1) {
-            TRACE_IO(MPI_File_write_at_all, (fh, NC_NUMRECS_OFFSET, (void*)pos,
-                                             len, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_write_at, (fh, NC_NUMRECS_OFFSET, (void*)pos,
-                                         len, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_RETURN_ERROR(NC_EWRITE)
-        }
-        else {
-            /* update the number of bytes written since file open.
-             * Because the above MPI write writes either 4 or 8 bytes,
-             * calling MPI_Get_count() is sufficient. No need to call
-             * MPI_Get_count_c()
+        if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr)
+            /* When intra-node aggregation is enabled, non-aggregators do not
+             * participate in the collective call.
              */
-            int put_size;
-            mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-            if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                ncp->put_size += len;
-            else
-                ncp->put_size += put_size;
+            return NC_NOERR;
+
+        if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+            /* reset fileview */
+            err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+            if (err != NC_NOERR) DEBUG_RETURN_ERROR(err)
         }
+
+        buf_view.size = len;
+
+        /* root's file view always includes the entire file header */
+        if (fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1)
+            wlen = ncmpio_file_write_at_all(ncp, NC_NUMRECS_OFFSET, (void*)pos,
+                                            buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, NC_NUMRECS_OFFSET, (void*)pos,
+                                        buf_view);
+        if (wlen < 0)
+            DEBUG_RETURN_ERROR((int)wlen)
     }
-    return NC_NOERR;
+    return err;
 }

 /*----< ncmpio_sync_numrecs() >-----------------------------------------------*/

diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c
index 8034f9f0b..0223977ce 100644
--- a/src/drivers/ncmpio/ncmpio_util.c
+++ b/src/drivers/ncmpio/ncmpio_util.c
@@ -18,267 +18,320 @@
 #include <string.h>
 #include <strings.h>
+#include <assert.h>
 #include "ncmpio_NC.h"

-/*----< ncmpio_set_pnetcdf_hints() >-----------------------------------------*/
-/* this is where the I/O hints designated to pnetcdf are extracted and their
- * default values are set.
- */
+#define MAX_INT_LEN 24
+
+/*----< ncmpio_hint_extract() >----------------------------------------------*/
+/* Extract hints from info. Argument info is the info object set by the
+ * application user and passed to ncmpi_create() or ncmpi_open(). For those
+ * PnetCDF hints that are not set in info, their default values are used.
+ */
-void ncmpio_set_pnetcdf_hints(NC *ncp,
-                              MPI_Info user_info,
-                              MPI_Info info_used)
+void ncmpio_hint_extract(NC *ncp,
+                         MPI_Info info)
 {
     char value[MPI_MAX_INFO_VAL];
-    int flag;
+    int flag, ival;
+    long long llval;

-    if (user_info == MPI_INFO_NULL) flag = 0;
+    assert(ncp != NULL);

-    /* Note info_used cannot be MPI_INFO_NULL, as it is returned from a call to
-     * MPI_File_get_info()
-     */
-    assert(info_used != MPI_INFO_NULL);
+    ncp->info_v_align = -1; /* -1 indicates not set */
+    ncp->info_r_align = -1; /* -1 indicates not set */
+
+    /* chunk size for reading header (set default before check hints) */
+    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
+
+    /* buffer to pack noncontiguous user buffers when calling wait() */
+    ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE;
+
+#ifdef ENABLE_SUBFILING
+    ncp->subfile_mode = 0;
+    ncp->num_subfiles = 0;
+#endif
+
+    ncp->dims.hash_size  = PNC_HSIZE_DIM;
+    ncp->vars.hash_size  = PNC_HSIZE_VAR;
+    ncp->attrs.hash_size = PNC_HSIZE_GATTR;
+    ncp->hash_size_attr  = PNC_HSIZE_VATTR;
+
+    /* number of INA aggregators per compute node */
+    ncp->num_aggrs_per_node = 0;
+
+    /* file system type */
+    ncp->fstype = PNCIO_FSTYPE_CHECK;
+
+    if (info == MPI_INFO_NULL) return;

     /* nc_var_align_size, and r_align take effect when a file is created, or
      * opened and later adding more metadata or variable data */

-    ncp->info_v_align = -1; /* -1 indicates not set */
-    if (user_info != MPI_INFO_NULL) {
-        /* aligns starting file offsets of entire data section */
-        MPI_Info_get(user_info, "nc_var_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ncp->info_v_align = strtoll(value, NULL, 10);
-            if (errno != 0) ncp->info_v_align = -1;
-            else if (ncp->info_v_align < 0) ncp->info_v_align = -1;
-        }
+    /* aligns starting file offsets of entire data section */
+    MPI_Info_get(info, "nc_var_align_size", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->info_v_align = llval;
     }
-    if (ncp->info_v_align == -1)
-        sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT);
-    else
-        sprintf(value, OFFFMT, ncp->info_v_align);
-    MPI_Info_set(info_used, "nc_var_align_size", value);

-    if (user_info != MPI_INFO_NULL) {
-        /* Hint nc_header_align_size is now deprecated. But for backward
-         * compatibility, let's still check.
-         */
-        MPI_Offset info_h_align = -1;
-        MPI_Info_get(user_info, "nc_header_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            info_h_align = strtoll(value, NULL, 10);
-            if (errno != 0) info_h_align = -1;
-            else if (info_h_align < 0) info_h_align = -1;
-        }
-        /* if nc_header_align_size is set and nc_var_align_size is not set,
-         * replace hint nc_var_align_size with the value of info_h_align.
-         */
-        if (info_h_align >= 0 && ncp->info_v_align == -1) {
-            ncp->info_v_align = info_h_align;
-            sprintf(value, OFFFMT, ncp->info_v_align);
-            MPI_Info_set(info_used, "nc_var_align_size", value);
+    /* Hint nc_header_align_size is now deprecated. But for backward
+     * compatibility, let's still check.
+     */
+    MPI_Info_get(info, "nc_header_align_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0) {
+            /* if nc_header_align_size is set and nc_var_align_size is not set,
+             * replace hint nc_var_align_size with the value of info_h_align.
+             */
+            if (llval >= 0 && ncp->info_v_align == -1)
+                ncp->info_v_align = llval;
         }
     }

-    ncp->info_r_align = -1;
-    if (user_info != MPI_INFO_NULL) {
-        /* aligns starting file offset of the record variable section */
-        MPI_Info_get(user_info, "nc_record_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ncp->info_r_align = strtoll(value, NULL, 10);
-            if (errno != 0) ncp->info_r_align = -1;
-            else if (ncp->info_r_align < 0) ncp->info_r_align = -1;
-        }
+    /* aligns starting file offset of the record variable section */
+    MPI_Info_get(info, "nc_record_align_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->info_r_align = llval;
     }
-    if (ncp->info_r_align == -1)
-        sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT);
-    else
-        sprintf(value, OFFFMT, ncp->info_r_align);
-    MPI_Info_set(info_used, "nc_record_align_size", value);

-    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
-    if (user_info != MPI_INFO_NULL) {
-        /* header reading chunk size */
-        MPI_Info_get(user_info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            int chunk;
-            errno = 0; /* errno must set to zero before calling strtoll */
-            chunk = atoi(value);
-            if (errno != 0) ncp->chunk = 0;
-            else if (ncp->chunk < 0)
+    /* header reading chunk size */
+    MPI_Info_get(info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0) {
+            if (llval < 0)
                 ncp->chunk = 0;
-            else if (chunk > NC_MAX_INT) /* limit to NC_MAX_INT */
+            else if (llval > NC_MAX_INT) /* limit to NC_MAX_INT */
                 ncp->chunk = NC_MAX_INT;
+            else
+                ncp->chunk = (int)llval;
+
+            /* CDF-5's minimum header size is 4 bytes more than CDF-1/2's */
+            ncp->chunk = PNETCDF_RNDUP(MAX(MIN_NC_XSZ+4, ncp->chunk), X_ALIGN);
         }
     }
-    sprintf(value, "%d", ncp->chunk);
-    MPI_Info_set(info_used, "nc_header_read_chunk_size", value);
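The errno-plus-strtoll validation pattern recurs for every numeric hint in this function. A hedged sketch of the idiom as a standalone helper, not part of the patch itself:

#include <errno.h>
#include <stdlib.h>

/* Illustrative helper: parse a non-negative integer hint value.
 * Returns -1 when the string is not a valid non-negative number,
 * mirroring the "-1 indicates not set" convention used above. */
static long long parse_hint_ll(const char *value)
{
    char *endp;
    long long llval;

    errno = 0;  /* must be cleared before calling strtoll */
    llval = strtoll(value, &endp, 10);
    if (errno != 0 || endp == value || llval < 0)
        return -1;
    return llval;
}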
-    strcpy(value, "auto");
-    if (user_info != MPI_INFO_NULL) {
-        /* setting in-place byte swap (matters only for Little Endian) */
-        MPI_Info_get(user_info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "enable") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_OFF);
-                fSet(ncp->flags, NC_MODE_SWAP_ON);
-            }
-            else if (strcasecmp(value, "disable") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_ON);
-                fSet(ncp->flags, NC_MODE_SWAP_OFF);
-            }
-            else if (strcasecmp(value, "auto") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_ON);
-                fClr(ncp->flags, NC_MODE_SWAP_OFF);
-            }
+    /* setting in-place byte swap (matters only for Little Endian) */
+    MPI_Info_get(info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        if (strcasecmp(value, "enable") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_OFF);
+            fSet(ncp->flags, NC_MODE_SWAP_ON);
+        }
+        else if (strcasecmp(value, "disable") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_ON);
+            fSet(ncp->flags, NC_MODE_SWAP_OFF);
+        }
+        else if (strcasecmp(value, "auto") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_ON);
+            fClr(ncp->flags, NC_MODE_SWAP_OFF);
         }
     }
-    MPI_Info_set(info_used, "nc_in_place_swap", value);

-    if (user_info != MPI_INFO_NULL) {
-        /* temporal buffer size used to pack noncontiguous aggregated user
-         * buffers when calling ncmpi_wait/wait_all, Default 16 MiB
-         */
-        MPI_Info_get(user_info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value,
-                     &flag);
-        if (flag) {
-            MPI_Offset ibuf_size;
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ibuf_size = strtoll(value, NULL, 10);
-            if (errno == 0 && ibuf_size >= 0) ncp->ibuf_size = ibuf_size;
-        }
+    /* Temporary buffer size used to pack non-contiguous aggregated user
+     * buffers when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE.
+     */
+    MPI_Info_get(info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->ibuf_size = llval;
     }
-    sprintf(value, OFFFMT, ncp->ibuf_size);
-    MPI_Info_set(info_used, "nc_ibuf_size", value);

 #ifdef ENABLE_SUBFILING
-    ncp->subfile_mode = 0;
-    if (user_info != MPI_INFO_NULL) {
-        MPI_Info_get(user_info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "enable") == 0)
-                ncp->subfile_mode = 1;
+    MPI_Info_get(info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        if (strcasecmp(value, "enable") == 0)
+            ncp->subfile_mode = 1;
+        else {
+            ncp->subfile_mode = 0;
+            ncp->num_subfiles = 0;
         }
     }
-    if (ncp->subfile_mode)
-        MPI_Info_set(info_used, "pnetcdf_subfiling", "enable");
-    else
-        MPI_Info_set(info_used, "pnetcdf_subfiling", "disable");

-    ncp->num_subfiles = 0;
-    if (user_info != MPI_INFO_NULL) {
-        MPI_Info_get(user_info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
+    if (ncp->subfile_mode == 1) {
+        MPI_Info_get(info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1, value, &flag);
         if (flag) {
-            errno = 0;
-            ncp->num_subfiles = atoi(value);
-            if (errno != 0) ncp->num_subfiles = 0;
-            else if (ncp->num_subfiles < 0) ncp->num_subfiles = 0;
+            errno = 0; /* errno must be set to zero before calling atoi */
+            ival = atoi(value);
+            if (errno == 0 && ival >= 0)
+                ncp->num_subfiles = ival;
         }
     }
-    sprintf(value, "%d", ncp->num_subfiles);
-    MPI_Info_set(info_used, "nc_num_subfiles", value);
-
-    if (ncp->subfile_mode == 0) ncp->num_subfiles = 0;
-#else
-    MPI_Info_set(info_used, "pnetcdf_subfiling", "disable");
-    MPI_Info_set(info_used, "nc_num_subfiles", "0");
 #endif

-    if (user_info != MPI_INFO_NULL) {
-        /* If romio_no_indep_rw is set to true, let all processes participate
-         * the read/write file header using MPI collective APIs, where only
-         * rank 0 has non-zero request count.
-         */
-        MPI_Info_get(user_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "true") == 0)
-                fSet((ncp)->flags, NC_HCOLL);
-        }
+    /* Hash table size for dimensions */
+    MPI_Info_get(info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->dims.hash_size = ival;
     }

-    ncp->dims.hash_size = PNC_HSIZE_DIM;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for dimensions */
-        MPI_Info_get(user_info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->dims.hash_size = atoi(value);
-            if (errno != 0 || ncp->dims.hash_size < 0)
-                ncp->dims.hash_size = PNC_HSIZE_DIM;
-        }
+    /* Hash table size for variables */
+    MPI_Info_get(info, "nc_hash_size_var", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->vars.hash_size = ival;
     }
-    sprintf(value, "%d", ncp->dims.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_dim", value);

-    ncp->vars.hash_size = PNC_HSIZE_VAR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for variables */
-        MPI_Info_get(user_info, "nc_hash_size_var", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->vars.hash_size = atoi(value);
-            if (errno != 0 || ncp->vars.hash_size < 0)
-                ncp->vars.hash_size = PNC_HSIZE_VAR;
-        }
+    /* Hash table size for global attributes */
+    MPI_Info_get(info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->attrs.hash_size = ival;
     }
-    sprintf(value, "%d", ncp->vars.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_var", value);

-    ncp->attrs.hash_size = PNC_HSIZE_GATTR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for global attributes */
-        MPI_Info_get(user_info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->attrs.hash_size = atoi(value);
-            if (errno != 0 || ncp->attrs.hash_size < 0)
-                ncp->attrs.hash_size = PNC_HSIZE_GATTR;
-        }
+    /* Hash table size for non-global attributes */
+    MPI_Info_get(info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->hash_size_attr = ival;
     }
-    sprintf(value, "%d", ncp->attrs.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_gattr", value);

-    ncp->hash_size_attr = PNC_HSIZE_VATTR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for non-global attributes */
-        MPI_Info_get(user_info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
+    /* Number of intra-node aggregators per compute node.
+     */
+    if (ncp->nprocs > 1) {
+        MPI_Info_get(info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1, value,
+                     &flag);
         if (flag) {
             errno = 0; /* errno must set to zero before calling atoi */
-            ncp->hash_size_attr = atoi(value);
-            if (errno != 0 || ncp->hash_size_attr < 0)
-                ncp->hash_size_attr = PNC_HSIZE_VATTR;
+            ival = atoi(value);
+            if (errno == 0 && ival >= 0)
+                ncp->num_aggrs_per_node = ival;
         }
     }
-    sprintf(value, "%d", ncp->hash_size_attr);
-    MPI_Info_set(info_used, "nc_hash_size_vattr", value);

-    ncp->num_aggrs_per_node = 0;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for non-global attributes */
-        MPI_Info_get(user_info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->num_aggrs_per_node = atoi(value);
-            if (errno != 0 || ncp->num_aggrs_per_node < 0)
-                ncp->num_aggrs_per_node = 0;
+    /* If the user explicitly wants to use MPI-IO instead of PnetCDF's internal
+     * PNCIO driver, then set the PnetCDF I/O hint "nc_pncio" to "disable".
+     */
+    MPI_Info_get(info, "nc_pncio", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag && strcasecmp(value, "disable") == 0)
+        ncp->fstype = PNCIO_FSTYPE_MPIIO;
+}
+
+/*----< ncmpio_hint_set() >--------------------------------------------------*/
+/* Insert PnetCDF hints into info. Argument info is the info object returned
+ * from an earlier call to MPI_File_get_info().
+ */
+void ncmpio_hint_set(NC *ncp,
+                     MPI_Info info)
+{
+    char int_str[MAX_INT_LEN];
+
+    assert(ncp != NULL);
+    assert(info != MPI_INFO_NULL);
+
+    /* nc_var_align_size, and r_align take effect when a file is created, or
+     * opened and later adding more metadata or variable data
+     */
+
+    /* aligns starting file offsets of entire data section */
+    if (ncp->info_v_align != -1) {
+        snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_v_align);
+        MPI_Info_set(info, "nc_var_align_size", int_str);
+    }
+
+    /* aligns starting file offset of the record variable section */
+    if (ncp->info_r_align != -1) {
+        snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_r_align);
+        MPI_Info_set(info, "nc_record_align_size", int_str);
+    }
+
+    /* header reading chunk size */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->chunk);
+    MPI_Info_set(info, "nc_header_read_chunk_size", int_str);
+
+    /* setting in-place byte swap (matters only for Little Endian) */
+    int swap_on  = fIsSet(ncp->flags, NC_MODE_SWAP_ON);
+    int swap_off = fIsSet(ncp->flags, NC_MODE_SWAP_OFF);
+    if (!swap_on && !swap_off)
+        MPI_Info_set(info, "nc_in_place_swap", "auto");
+    else if (swap_on)
+        MPI_Info_set(info, "nc_in_place_swap", "enable");
+    else
+        MPI_Info_set(info, "nc_in_place_swap", "disable");
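Because ncmpio_hint_set() populates the info object later handed back to applications, the effective values can be verified from user code. A hedged sketch using the standard inquiry API; ncid is assumed to come from a successful ncmpi_open or ncmpi_create:

#include <stdio.h>
#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: query the effective PnetCDF hints of an open file. */
void print_pncio_hints(int ncid)
{
    char value[MPI_MAX_INFO_VAL];
    int flag;
    MPI_Info info;

    ncmpi_inq_file_info(ncid, &info);  /* returns a copy; caller frees */
    MPI_Info_get(info, "nc_pncio", MPI_MAX_INFO_VAL-1, value, &flag);
    if (flag) printf("nc_pncio = %s\n", value);
    MPI_Info_get(info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1,
                 value, &flag);
    if (flag) printf("nc_num_aggrs_per_node = %s\n", value);
    MPI_Info_free(&info);
}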
+
+    /* Temporary buffer size used to pack non-contiguous aggregated user
+     * buffers when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE.
+     */
+    snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->ibuf_size);
+    MPI_Info_set(info, "nc_ibuf_size", int_str);
+
+#ifdef ENABLE_SUBFILING
+    if (ncp->subfile_mode)
+        MPI_Info_set(info, "pnetcdf_subfiling", "enable");
+    else
+        MPI_Info_set(info, "pnetcdf_subfiling", "disable");
+
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_subfiles);
+    MPI_Info_set(info, "nc_num_subfiles", int_str);
+#endif
+
+    /* Hash table size for dimensions */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->dims.hash_size);
+    MPI_Info_set(info, "nc_hash_size_dim", int_str);
+
+    /* Hash table size for variables */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->vars.hash_size);
+    MPI_Info_set(info, "nc_hash_size_var", int_str);
+
+    /* Hash table size for global attributes */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->attrs.hash_size);
+    MPI_Info_set(info, "nc_hash_size_gattr", int_str);
+
+    /* Hash table size for non-global attributes */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->hash_size_attr);
+    MPI_Info_set(info, "nc_hash_size_vattr", int_str);
+
+    /* Whether using MPI-IO instead of PnetCDF's internal PNCIO driver. */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO)
+        MPI_Info_set(info, "nc_pncio", "disable");
+    else
+        MPI_Info_set(info, "nc_pncio", "enable");
+
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Number of intra-node aggregators per compute node. */
+        snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_aggrs_per_node);
+        MPI_Info_set(info, "nc_num_aggrs_per_node", int_str);
+
+        /* Add hint "nc_ina_node_list", the list of INA aggregators' rank IDs */
+        if (ncp->ina_node_list != NULL) {
+            char value[MPI_MAX_INFO_VAL];
+            int i;
+            snprintf(value, MAX_INT_LEN, "%d", ncp->ina_node_list[0]);
+            for (i=1; i<ncp->ina_nprocs; i++) {
+                snprintf(int_str, sizeof(int_str), " %d", ncp->ina_node_list[i]);
+                if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) {
+                    strcat(value, " ...");
+                    break;
+                }
+                strcat(value, int_str);
+            }
+            MPI_Info_set(info, "nc_ina_node_list", value);
+        }
+    }
-    sprintf(value, "%d", ncp->num_aggrs_per_node);
-    MPI_Info_set(info_used, "nc_num_aggrs_per_node", value);
+    else /* Update hint "nc_num_aggrs_per_node" to indicate disabled. */
+        MPI_Info_set(info, "nc_num_aggrs_per_node", "0");
 }

 /*----< ncmpio_first_offset() >-----------------------------------------------*/
@@ -730,12 +783,12 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
             break;
         }
         /* The only error codes returned from the above switch block are
-        * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity
-        * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus,
-        * the only possible error is NC_ERANGE. NC_ERANGE can be caused by
-        * one or more elements of buf that is out of range representable by
-        * the external data type, it is not considered a fatal error. This
-        * request must continue to finish.
+         * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity
+         * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus,
+         * the only possible error is NC_ERANGE. NC_ERANGE can be caused by
+         * one or more elements of buf that is out of range representable by
+         * the external data type, it is not considered a fatal error. This
+         * request must continue to finish.
         */
     }
     else {
@@ -785,30 +838,36 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
         MPI_Type_free(&imaptype);
     }

-    /* unpacked lbuf into buf based on buftype -----------------------------*/
-    if (!buftype_is_contig && lbuf != buf) {
-        /* no need unpack when buftype is used in MPI_File_read (lbuf == buf) */
+    /* Unpack lbuf into buf based on buftype. Note there is no need to unpack
+     * when buftype is used in MPI_File_read, i.e. lbuf == buf.
+     */
+    if (lbuf != buf) {
+        if (buftype_is_contig)
+            memcpy(buf, lbuf, ibuf_size);
+        else { /* buftype is not contiguous */
 #ifdef HAVE_MPI_LARGE_COUNT
-        MPI_Count position = 0;
-        mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position, buf,
-                                 (MPI_Count)bufcount, buftype, MPI_COMM_SELF);
-        if (mpireturn != MPI_SUCCESS)
-            return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c");
-#else
-        if (bufcount > NC_MAX_INT) {
-            if (err == NC_NOERR)
-                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
-        }
-        else {
-            int position = 0;
-            if (ibuf_size > NC_MAX_INT)
-                DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
-            mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf,
-                                   (int)bufcount, buftype, MPI_COMM_SELF);
+            MPI_Count position = 0;
+            mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position,
+                                     buf, (MPI_Count)bufcount, buftype,
+                                     MPI_COMM_SELF);
             if (mpireturn != MPI_SUCCESS)
-                return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack");
-        }
+                return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c");
+#else
+            if (bufcount > NC_MAX_INT) {
+                if (err == NC_NOERR)
+                    DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+            }
+            else {
+                int position = 0;
+                if (ibuf_size > NC_MAX_INT)
+                    DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
+                mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf,
+                                       (int)bufcount, buftype, MPI_COMM_SELF);
+                if (mpireturn != MPI_SUCCESS)
+                    return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack");
+            }
 #endif
+        }
     }
     if (free_cbuf) NCI_Free(cbuf);
     if (free_lbuf) NCI_Free(lbuf);
@@ -816,3 +875,136 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
     return err;
 }
+
+/*----< ncmpio_calc_off() >--------------------------------------------------*/
+/* Returns the starting file offset of a subarray request.
+ * Note a zero-length request should never call this subroutine; it is called
+ * only when the request flattens to a single offset-length pair.
+ */
+int
+ncmpio_calc_off(const NC         *ncp,
+                const NC_var     *varp,
+                const MPI_Offset *start,  /* [varp->ndims] */
+                MPI_Offset       *offset) /* OUT: start offset */
+{
+    int i, ndims = varp->ndims; /* number of dimensions of this variable */
+
+    /*
+     * varp->dsizes[] is computed from right to left product of shape
+     * For example, a 3D array of size 5x4x3 in C order,
+     * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3
+     * For record variable:     dsizes[0]=12 dsizes[1]=12 dsizes[2]=3
+     */
+    if (IS_RECVAR(varp)) {
+        *offset = 0;
+        if (ndims > 1) {
+            /* start from the least significant dimension */
+            *offset = start[ndims-1];
+            /* the remaining dimensions */
+            for (i=ndims-2; i>0; i--)
+                *offset += start[i]*varp->dsizes[i+1];
+        }
+        *offset *= varp->xsz; /* offset in bytes */
+    }
+    else {
+        /* first handle the least significant dimension */
+        *offset = start[ndims-1];
+        /* remaining dimensions till the most significant dimension */
+        for (i=ndims-2; i>=0; i--)
+            *offset += start[i] * varp->dsizes[i+1];
+        *offset *= varp->xsz; /* offset in bytes */
+    }
+
+    return NC_NOERR;
+}
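To make the offset arithmetic concrete, a hedged worked example for a fixed-size 3D variable of shape 5x4x3 (so dsizes[] = {60, 12, 3}) and a 4-byte external type; the numbers are illustrative only:

#include <mpi.h>

/* Illustrative only: mimic ncmpio_calc_off() for start[] = {2, 1, 0}. */
static MPI_Offset calc_off_example(void)
{
    const MPI_Offset start[3]  = {2, 1, 0};
    const MPI_Offset dsizes[3] = {60, 12, 3};
    MPI_Offset off = start[2];        /* least significant dimension: 0 */
    off += start[1] * dsizes[2];      /* 1 * 3  = 3  */
    off += start[0] * dsizes[1];      /* 2 * 12 = 24 */
    return off * 4;                   /* in bytes: 27 * 4 = 108 */
}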
+
+/*----< ncmpio_calc_start_end() >--------------------------------------------*/
+/* Returns the file offsets of the access range of this request: the starting
+ * file offset and the end offset (exclusive).
+ * Note a zero-length request should never call this subroutine.
+ */
+int
+ncmpio_calc_start_end(const NC         *ncp,
+                      const NC_var     *varp,
+                      const MPI_Offset *start,     /* [varp->ndims] */
+                      const MPI_Offset *count,     /* [varp->ndims] */
+                      const MPI_Offset *stride,    /* [varp->ndims] */
+                      MPI_Offset       *start_off, /* OUT: start offset */
+                      MPI_Offset       *end_off)   /* OUT: end offset */
+{
+    int i, ndims = varp->ndims; /* number of dimensions of this variable */
+
+    /*
+     * varp->dsizes[] is computed from right to left product of shape
+     * For example, a 3D array of size 5x4x3 in C order,
+     * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3
+     * For record variable:     dsizes[0]=12 dsizes[1]=12 dsizes[2]=3
+     */
+    if (IS_RECVAR(varp)) {
+        *start_off = 0;
+        *end_off   = 0;
+        if (stride == NULL) {
+            if (ndims > 1) {
+                /* least significant dimension */
+                *start_off = start[ndims-1];
+                *end_off   = start[ndims-1]+(count[ndims-1]-1);
+                /* the remaining dimensions */
+                for (i=ndims-2; i>0; i--) {
+                    *start_off += start[i]*varp->dsizes[i+1];
+                    *end_off   += (start[i]+(count[i]-1))*varp->dsizes[i+1];
+                }
+            }
+            *start_off *= varp->xsz; /* offset in bytes */
+            *end_off   *= varp->xsz;
+            /* handle the unlimited, most significant dimension */
+            *start_off += start[0] * ncp->recsize;
+            *end_off   += (start[0]+(count[0]-1)) * ncp->recsize;
+        }
+        else {
+            if (ndims > 1) {
+                /* least significant dimension */
+                *start_off = start[ndims-1];
+                *end_off   = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1];
+                /* the remaining dimensions */
+                for (i=ndims-2; i>0; i--) {
+                    *start_off += start[i]*varp->dsizes[i+1];
+                    *end_off   += (start[i]+(count[i]-1)*stride[i]) *
+                                  varp->dsizes[i+1];
+                }
+            }
+            *start_off *= varp->xsz; /* offset in bytes */
+            *end_off   *= varp->xsz;
+            /* handle the unlimited, most significant dimension */
+            *start_off += start[0] * ncp->recsize;
+            *end_off   += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize;
+        }
+    }
+    else {
+        if (stride == NULL) {
+            /* first handle the least significant dimension */
+            *start_off = start[ndims-1];
+            *end_off   = start[ndims-1] + (count[ndims-1]-1);
+            /* remaining dimensions till the most significant dimension */
+            for (i=ndims-2; i>=0; i--) {
+                *start_off += start[i] * varp->dsizes[i+1];
+                *end_off   += (start[i]+(count[i]-1)) * varp->dsizes[i+1];
+            }
+        }
+        else {
+            /* first handle the least significant dimension */
+            *start_off = start[ndims-1];
+            *end_off   = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1];
+            /* remaining dimensions till the most significant dimension */
+            for (i=ndims-2; i>=0; i--) {
+                *start_off += start[i] * varp->dsizes[i+1];
+                *end_off   += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1];
+            }
+        }
+        *start_off *= varp->xsz; /* offset in bytes */
+        *end_off   *= varp->xsz;
+    }
+    *start_off += varp->begin; /* beginning file offset of this variable */
+    *end_off   += varp->begin + varp->xsz;
+
+    return NC_NOERR;
+}
+
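A hedged worked example of the access range with a stride, for the same illustrative 5x4x3 fixed-size variable (xsz = 4; the begin offset of 1024 is made up):

#include <mpi.h>

/* Illustrative only: mimic ncmpio_calc_start_end() for start = {0,0,0},
 * count = {2,2,2}, stride = {2,2,2}, dsizes[] = {60, 12, 3}. */
static void calc_range_example(MPI_Offset *start_off, MPI_Offset *end_off)
{
    const MPI_Offset dsizes[3] = {60, 12, 3};
    MPI_Offset begin = 1024, xsz = 4;

    *start_off = 0;  /* start[] is all zeros */
    /* last accessed element per dim: start[i] + (count[i]-1)*stride[i] = 2 */
    *end_off = 2 + 2*dsizes[2] + 2*dsizes[1];    /* 2 + 6 + 24 = 32 */

    *start_off = *start_off * xsz + begin;       /* 1024 */
    *end_off   = *end_off * xsz + begin + xsz;   /* exclusive end: 1156 */
}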
diff --git a/src/drivers/ncmpio/ncmpio_vard.c b/src/drivers/ncmpio/ncmpio_vard.c
index 7f3fe1224..ac032d1ac 100644
--- a/src/drivers/ncmpio/ncmpio_vard.c
+++ b/src/drivers/ncmpio/ncmpio_vard.c
@@ -55,9 +55,8 @@ getput_vard(NC *ncp,
     void *xbuf=NULL;
     int mpireturn, status=NC_NOERR, err=NC_NOERR, xtype_is_contig=1;
     int el_size, buftype_is_contig=0, need_swap_back_buf=0;
-    int need_convert=0, need_swap=0, coll_indep, rw_flag;
-    MPI_File fh;
-    MPI_Offset nelems=0, fnelems=0, bnelems=0, offset=0;
+    int need_convert=0, need_swap=0;
+    MPI_Offset fnelems=0, bnelems=0, offset=0;
     MPI_Datatype etype=MPI_DATATYPE_NULL, xtype=MPI_BYTE;
     MPI_Offset filetype_size=0;
 #ifdef HAVE_MPI_TYPE_SIZE_C
@@ -71,6 +70,17 @@ getput_vard(NC *ncp,
     int type_size;
 #endif

+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+        fprintf(stderr, "PnetCDF vard APIs are only supported when using MPI-IO.\n");
+        fprintf(stderr, "Please set environment variable PNETCDF_HINTS to \"nc_pncio=disable\"\n");
+        return NC_ENOTSUPPORT;
+    }
+
+    if (ncp->num_aggrs_per_node > 0) {
+        fprintf(stderr, "PnetCDF vard APIs are not supported when intra-node aggregation is enabled\n");
+        return NC_ENOTSUPPORT;
+    }
+
 #ifdef ENABLE_SUBFILING
     /* call a separate routine if variable is stored in subfiles */
     if (varp->num_subfiles > 1) {
@@ -170,7 +180,7 @@ getput_vard(NC *ncp,
         bnelems = bufcount;
     }
     else {
-        /* find the element type of filetype. ncmpii_dtype_decode() checks
+        /* find the element type of buftype. ncmpii_dtype_decode() checks
          * NC_EMULTITYPES */
         err = ncmpii_dtype_decode(buftype, &etype, &el_size, &bnelems,
                                   NULL, &buftype_is_contig);
@@ -214,8 +224,8 @@ getput_vard(NC *ncp,
             }
         }

-        if (!need_convert &&
-            (!need_swap || (can_swap_in_place && buftype_is_contig))) {
+        if (!need_convert && buftype_is_contig &&
+            (!need_swap || can_swap_in_place)) {
             /* reuse buftype, bufcount, buf in later MPI file write */
             xbuf = buf;
             if (need_swap) {
@@ -246,7 +256,7 @@ getput_vard(NC *ncp,
         }
     }
     else { /* read request */
-        if (!need_convert && (!need_swap || buftype_is_contig)) {
+        if (!need_convert && !need_swap && buftype_is_contig) {
             /* reuse buftype, bufcount, buf in later MPI file read */
             xbuf = buf;
         }
@@ -259,18 +269,7 @@ getput_vard(NC *ncp,
             xtype_is_contig = 1;
         }
     }
-
-    /* Set nelems and xtype which will be used in MPI read/write */
-    if (buf != xbuf) {
-        /* xbuf is a malloc-ed contiguous buffer */
-        nelems = bnelems;
-    }
-    else {
-        /* we can safely use bufcount and buftype in MPI File read/write.
-         * Note buftype may be noncontiguous. */
-        nelems = bufcount;
-        xtype = buftype;
-    }
+    assert(xtype_is_contig == 1);

     /* set fileview's displacement to the variable's starting file offset */
     offset = varp->begin;
@@ -296,7 +295,6 @@ getput_vard(NC *ncp,
          */
         offset = 0;
         bufcount = 0;
-        nelems = 0;
         filetype_size = 0;
         filetype = MPI_BYTE;
         buftype = MPI_BYTE;
@@ -305,31 +303,79 @@ getput_vard(NC *ncp,
     }
     status = err;

+    /* set the MPI-IO fileview, this is a collective call */
+#if 1
+    /* vard API is only supported when using MPI-IO, not PNCIO */
+    char *mpi_name;
+    MPI_File fh;
+
     /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */
-    fh = ncp->independent_fh;
-    coll_indep = NC_REQ_INDEP;
-    if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) {
-        fh = ncp->collective_fh;
-        coll_indep = NC_REQ_COLL;
-    }
+    fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP))
+         ? ncp->collective_fh : ncp->independent_fh;

-    /* set the MPI-IO fileview, this is a collective call */
-    err = ncmpio_file_set_view(ncp, fh, &offset, filetype);
+    TRACE_IO(MPI_File_set_view, (fh, offset, MPI_BYTE, filetype, "native",
+                                 MPI_INFO_NULL));
+    if (mpireturn != MPI_SUCCESS) {
+        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        if (status == NC_NOERR) status = err;
+    }
+#else
+    err = ncmpio_file_set_view(ncp, offset, filetype, 0, NULL, NULL);
+#endif
     if (err != NC_NOERR) {
         if (status == NC_NOERR) status = err;
-        nelems = 0; /* skip this request */
+        filetype_size = 0; /* skip this request */
     }
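Given the two new NC_ENOTSUPPORT paths above, callers can anticipate them. A hedged user-side sketch that falls back from the vard API to the vara family when PNCIO or intra-node aggregation is active; the arguments are placeholders for illustration:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: try the flexible vard API first; on
 * NC_ENOTSUPPORT, fall back to a plain put_vara call. */
int put_with_fallback(int ncid, int varid, MPI_Datatype filetype,
                      const MPI_Offset *start, const MPI_Offset *count,
                      const float *buf, MPI_Offset nelems)
{
    int err = ncmpi_put_vard_all(ncid, varid, filetype, buf,
                                 nelems, MPI_FLOAT);
    if (err == NC_ENOTSUPPORT)  /* vard unsupported under PNCIO/INA */
        err = ncmpi_put_vara_float_all(ncid, varid, start, count, buf);
    return err;
}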
-    rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? NC_REQ_RD : NC_REQ_WR;
+#if 1
+    /* vard API is only supported when using MPI-IO, not PNCIO */
+    int coll_indep = NC_REQ_INDEP;
+    if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP))
+        coll_indep = NC_REQ_COLL;
+
+    PNCIO_View buf_view;
+    buf_view.type      = MPI_BYTE;
+    buf_view.size      = filetype_size;
+    buf_view.count     = 1;
+    buf_view.is_contig = 1;
+
+    if (fIsSet(reqMode, NC_REQ_RD)) {
+        MPI_Offset rlen;
+
+        if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL)
+            rlen = ncmpio_file_read_at_all(ncp, 0, xbuf, buf_view);
+        else
+            rlen = ncmpio_file_read_at(ncp, 0, xbuf, buf_view);
+        if (status == NC_NOERR && rlen < 0) status = (int)rlen;
+    }
+    else {
+        MPI_Offset wlen;

-    err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, nelems,
-                            xtype, xbuf, xtype_is_contig);
+        if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL)
+            wlen = ncmpio_file_write_at_all(ncp, 0, xbuf, buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, 0, xbuf, buf_view);
+        if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+    }
+#else
+    int rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? NC_REQ_RD : NC_REQ_WR;
+
+    err = ncmpio_read_write(ncp, rw_flag, 0, nelems, xtype, xbuf);
     if (status == NC_NOERR) status = err;
+#endif

-    /* No longer need to reset the file view, as the root's fileview includes
-     * the whole file header.
-    MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
-     */
+    /* reset fileview to make entire file visible */
+#if 1
+    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
+                                 MPI_INFO_NULL));
+    if (mpireturn != MPI_SUCCESS) {
+        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        if (status == NC_NOERR) status = err;
+    }
+#else
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    if (status == NC_NOERR) status = err;
+#endif

     if (fIsSet(reqMode, NC_REQ_RD)) {
         if (filetype_size == 0) return status;

diff --git a/src/drivers/ncmpio/ncmpio_wait.c b/src/drivers/ncmpio/ncmpio_wait.c
index fc635acfa..236499db1 100644
--- a/src/drivers/ncmpio/ncmpio_wait.c
+++ b/src/drivers/ncmpio/ncmpio_wait.c
@@ -34,71 +34,6 @@
 NetCDF XDR
 Level          xbuf     (XDR I/O buffer)
 */

-/* Prototypes for functions used only in this file */
-static int wait_getput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag,
-                       int coll_indep, MPI_Offset newnumrecs);
-
-static int mgetput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag,
-                   int coll_indep);
-
-/*----< ncmpio_getput_zero_req() >-------------------------------------------*/
-/* This function is called when this process has zero-length I/O request and
- * must participate all the MPI collective calls involved in the collective
- * APIs and wait_all(), which include setting fileview, collective read/write,
- * another setting fileview.
- *
- * This function is collective.
- */
-int
-ncmpio_getput_zero_req(NC *ncp, int reqMode)
-{
-    char *mpi_name;
-    int err, mpireturn, status=NC_NOERR;
-    MPI_Status mpistatus;
-    MPI_File fh;
-
-    /* do nothing if this came from an independent API */
-    if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR;
-
-    fh = ncp->collective_fh;
-
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL));
-
-    if (fIsSet(reqMode, NC_REQ_RD)) {
-        if (ncp->nprocs > 1) {
-            TRACE_IO(MPI_File_read_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_read_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            err = (err == NC_EFILE) ? NC_EREAD : err;
NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } else { /* write request */ - if (ncp->nprocs > 1) { - TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_write_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - - /* No longer need to reset the file view, as the root's fileview includes - * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); - */ - - return status; -} - /*----< abuf_coalesce() >----------------------------------------------------*/ /* this function should be called after all bput requests have been served */ static int @@ -332,389 +267,115 @@ ncmpio_cancel(void *ncdp, return status; } -/*----< construct_filetypes() >----------------------------------------------*/ -/* concatenate the requests into a single MPI derived filetype */ +/*----< extract_reqs() >-----------------------------------------------------*/ +/* extract requests from the queues into new queues to be committed. + * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL + */ static int -construct_filetypes(NC *ncp, - NC_lead_req *lead_list, /* NC_REQ_WR or NC_REQ_RD */ - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *filetype) /* OUT */ +extract_reqs(NC *ncp, + int num_reqs, + int *req_ids, /* IN: [num_reqs] or NULL */ + int *statuses, /* IN: [num_reqs] or NULL */ + int *num_r_lead_reqs, /* OUT: no. lead get requests */ + int *num_r_reqs, /* OUT: no. non-lead get requests */ + NC_req **get_list, /* OUT: extracted get requests */ + int *num_w_lead_reqs, /* OUT: no. lead put requests */ + int *num_w_reqs, /* OUT: no. 
non-lead put requests */ + NC_req **put_list) /* OUT: extracted put requests */ { - int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req; - int mpireturn; - MPI_Datatype *ftypes; - - if (num_reqs <= 0) { /* for participating collective call */ - *filetype = MPI_BYTE; - return NC_NOERR;; - } + int i, j, status=NC_NOERR; + NC_req *put_list_ptr, *get_list_ptr; - /* hereinafter, num_reqs > 0 */ - ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs); + *num_r_lead_reqs = 0; + *num_w_lead_reqs = 0; + *num_r_reqs = 0; + *num_w_reqs = 0; - /* create a filetype for each request */ - last_contig_req = -1; /* index of the last contiguous request */ - j = 0; /* index of last valid ftypes */ - for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - lead = lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + } + if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { + /* the entire get requests */ + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - ftypes[j] = MPI_BYTE; /* in case the call below failed */ + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + } + if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || + num_reqs == NC_PUT_REQ_ALL) + return NC_NOERR; - if (ndims == 0) { /* scalar variable */ -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (lead->varp->begin > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) - status = err; /* report first error */ + if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { + /* this is the same as NC_PUT_REQ_ALL */ + for (i=0; inumLeadPutReqs; i++) { + ncp->put_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } -#endif - disps[j] = lead->varp->begin; - is_ftype_contig = 1; } - else { /* non-scalar variable */ - MPI_Offset offset, *count, *stride; - count = reqs[i].start + ndims; - stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ? 
- NULL : count + ndims; - - err = ncmpio_filetype_create_vars(ncp, - lead->varp, - reqs[i].start, - count, - stride, - &offset, - &ftypes[j], - &is_ftype_contig); - -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (err == NC_NOERR && offset > NC_MAX_INT) - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) -#endif - disps[j] = (MPI_Aint)offset; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - if (err != NC_NOERR) { - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) status = err; /* report first error */ - continue; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + return NC_NOERR; + } + if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { + /* this is the same as NC_GET_REQ_ALL */ + for (i=0; inumLeadGetReqs; i++) { + ncp->get_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } } + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - if (is_ftype_contig) { - MPI_Offset coalesced_len; + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; + } + if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && + statuses == NULL) { + /* this is the same as NC_REQ_ALL */ + for (i=0; ivarp->xsz * reqs[i].nelems; + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = coalesced_len; -#else - if (coalesced_len > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - if (status == NC_NOERR) - status = err; /* report first error */ - coalesced_len = 0; - } - blocklens[j] = (int)coalesced_len; -#endif - if (last_contig_req >= 0) - coalesced_len += blocklens[last_contig_req]; -#ifdef HAVE_MPI_LARGE_COUNT - if (last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = coalesced_len; - j--; - } - else last_contig_req = j; -#else - /* if coalesced_len overflows 4-byte int, then skip coalescing */ - if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = (int)coalesced_len; - j--; - } - else last_contig_req = j; -#endif - } - else { - /* we will construct a filetype, set blocklen to 1 */ - blocklens[j] = 1; - last_contig_req = -1; - all_ftype_contig = 0; - } - } - /* j is the new num_reqs */ - num_reqs = j; - - if (status != NC_NOERR) { - /* even if error occurs, we still must participate the collective - call to MPI_File_set_view() */ - *filetype = MPI_BYTE; - } - else if (num_reqs == 1 && disps[0] == 0) { - if (ftypes[0] == MPI_BYTE) - *filetype = MPI_BYTE; - else { - mpireturn = MPI_Type_dup(ftypes[0], filetype); - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup"); - } - } - else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */ - /* all ftypes[] created fine, now concatenate all ftypes[] */ - if (all_ftype_contig) { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#else - mpireturn = 
MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - else { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps, - ftypes, filetype); -#else - mpireturn = MPI_Type_create_struct(num_reqs, blocklens, disps, - ftypes, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - - if (err != NC_NOERR) *filetype = MPI_BYTE; - if (status == NC_NOERR) status = err; /* report the first error */ - } - - for (i=0; i--------------------------------------------*/ -/* the input requests, reqs[], are non-interleaving requests */ -static int -construct_buffertypes(NC_lead_req *lead_list, - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *buf_type) /* OUT */ -{ - int i, j, k, status=NC_NOERR, mpireturn; - MPI_Aint a0, ai; - - *buf_type = MPI_BYTE; - if (num_reqs == 0) return NC_NOERR; - - /* create the I/O buffer derived data type */ - - /* calculate blocklens[], and disps[] */ - for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - - req_size = lead->varp->xsz; - if (lead->varp->ndims > 0) { /* non-scalar variable */ - MPI_Offset *count = reqs[i].start + lead->varp->ndims; - if (!IS_RECVAR(lead->varp)) req_size *= count[0]; - for (k=1; kvarp->ndims; k++) req_size *= count[k]; - } - -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = req_size; -#else - /* check int overflow */ - if (req_size > NC_MAX_INT) { /* skip this request */ - fSet(lead->flag, NC_REQ_SKIP); - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - continue; - } - blocklens[j] = (int)req_size; -#endif - - MPI_Get_address(reqs[i].xbuf, &ai); - if (j == 0) a0 = ai; - disps[j] = MPI_Aint_diff(ai, a0); - j++; - } - /* update num_reqs to number of valid requests */ - num_reqs = j; - - if (num_reqs > 0) { - /* concatenate buffer addresses into a single buffer type */ -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, buf_type); -#else - mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, buf_type); -#endif - if (mpireturn != MPI_SUCCESS) { - int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } - else - MPI_Type_commit(buf_type); - } - - return status; -} - -/*----< extract_reqs() >-----------------------------------------------------*/ -/* extract requests from the queues into new queues to be committed. - * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL - */ -static int -extract_reqs(NC *ncp, - int num_reqs, - int *req_ids, /* IN: [num_reqs] or NULL */ - int *statuses, /* IN: [num_reqs] or NULL */ - int *num_r_lead_reqs, /* OUT: no. lead get requests */ - int *num_r_reqs, /* OUT: no. non-lead get requests */ - NC_req **get_list, /* OUT: extracted get requests */ - int *num_w_lead_reqs, /* OUT: no. lead put requests */ - int *num_w_reqs, /* OUT: no. 
non-lead put requests */ - NC_req **put_list) /* OUT: extracted put requests */ -{ - int i, j, status=NC_NOERR; - NC_req *put_list_ptr, *get_list_ptr; - - *num_r_lead_reqs = 0; - *num_w_lead_reqs = 0; - *num_r_reqs = 0; - *num_w_reqs = 0; - - if (num_reqs == NC_PUT_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire put requests */ - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - } - if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire get requests */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || - num_reqs == NC_PUT_REQ_ALL) - return NC_NOERR; - - if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { - /* this is the same as NC_PUT_REQ_ALL */ - for (i=0; inumLeadPutReqs; i++) { - ncp->put_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - return NC_NOERR; - } - if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { - /* this is the same as NC_GET_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) { - ncp->get_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; - } - if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && - statuses == NULL) { - /* this is the same as NC_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; } /* requests are a subset of pending requests */ @@ -797,7 +458,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->put_lead_list[j].id) { memcpy(put_list_ptr, ncp->put_list + ncp->put_lead_list[j].nonlead_off, - ncp->put_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->put_lead_list[j].nonlead_num); put_list_ptr += ncp->put_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -810,7 +471,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->get_lead_list[j].id) { memcpy(get_list_ptr, ncp->get_list + ncp->get_lead_list[j].nonlead_off, - ncp->get_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->get_lead_list[j].nonlead_num); 
get_list_ptr += ncp->get_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -987,30 +648,72 @@ req_commit(NC *ncp, do_write = (num_w_reqs > 0); } +#if 1 /* carry out writes and reads separately (writes first) */ - if (do_write > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_WR, num_w_reqs, put_list, + newnumrecs); + put_list = NULL; /* has been freed in the above call */ + + /* Update the number of records if new records have been created. + * For nonblocking APIs, there is no way for a process to know whether + * others write to a record variable or not. Note newnumrecs has been + * sync-ed and always >= ncp->numrecs. + */ + if (coll_indep == NC_REQ_COLL) { + if (newnumrecs > ncp->numrecs) { + /* update new record number in file. Note newnumrecs is already + * sync-ed among all processes and in collective mode + * ncp->numrecs is always sync-ed in memory among processes, + * thus no need another MPI_Allreduce to sync it. */ + err = ncmpio_write_numrecs(ncp, newnumrecs); + if (status == NC_NOERR) status = err; + /* retain the first error if there is any */ + if (ncp->numrecs < newnumrecs) ncp->numrecs = newnumrecs; + } + } + else { /* NC_REQ_INDEP */ + if (ncp->numrecs < newnumrecs) { + ncp->numrecs = newnumrecs; + set_NC_ndirty(ncp); + /* delay numrecs sync until end_indep, redef or close */ + } + } + } + if (do_read > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_RD, num_r_reqs, get_list, + newnumrecs); + get_list = NULL; /* has been freed in the above call */ + } +#else + if (do_write > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, + num_w_reqs, put_list, + newnumrecs); + else + err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, + newnumrecs); + put_list = NULL; /* has been freed in wait_getput() */ + } + + if (do_read > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_RD, + num_r_reqs, get_list, + newnumrecs); + else + err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, + newnumrecs); + get_list = NULL; /* has been freed in wait_getput() */ + } +#endif + + /* retain the first error status */ + if (status == NC_NOERR) status = err; - if (ncp->my_aggr >= 0 && coll_indep == NC_REQ_COLL && ncp->nprocs > 1) - /* intra-node write aggregation must be in collective mode */ - err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, - num_w_reqs, put_list, - newnumrecs); - else - err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, - newnumrecs); - put_list = NULL; /* has been freed in wait_getput() */ - } - - if (do_read > 0) { - err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, - newnumrecs); - get_list = NULL; /* has been freed in wait_getput() */ - } - - /* retain the first error status */ - if (status == NC_NOERR) status = err; - /* post-IO data processing: In write case, we may need to byte-swap user * write buf if it is used as the write buffer in MPI write call and the * target machine is little Endian. 
For read case, we may need to
@@ -1114,137 +817,424 @@ req_commit(NC *ncp,
             j++;
         }
     }
-    ncp->numLeadGetReqs = j;
-    if (ncp->numLeadGetReqs == 0) {
-        NCI_Free(ncp->get_list);
-        NCI_Free(ncp->get_lead_list);
-        ncp->get_list = NULL;
-        ncp->get_lead_list = NULL;
-    }
+        ncp->numLeadGetReqs = j;
+        if (ncp->numLeadGetReqs == 0) {
+            NCI_Free(ncp->get_list);
+            NCI_Free(ncp->get_lead_list);
+            ncp->get_list = NULL;
+            ncp->get_lead_list = NULL;
+        }
+    }
+
+    return status;
+}
+
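The new ncmpio_wait() that follows backs both ncmpi_wait() and
ncmpi_wait_all(). As its comments note, the collective flavor must be entered
by every process that opened the file, even ranks with zero pending requests,
because the flush path issues collective MPI operations. A minimal usage
sketch against the public API (the helper name is hypothetical; error
handling is trimmed):

#include <mpi.h>
#include <pnetcdf.h>

/* Hypothetical helper: flush nonblocking requests previously posted with
 * ncmpi_iput_*/ncmpi_iget_* on every process, then flush any leftovers. */
static int flush_all_requests(int ncid, int nreqs, int *reqs)
{
    int err;

    /* collective: all processes must call, even when nreqs == 0 */
    err = ncmpi_wait_all(ncid, nreqs, reqs, NULL); /* statuses may be NULL */
    if (err != NC_NOERR) return err;

    /* NC_REQ_ALL flushes every pending request; req_ids/statuses ignored */
    return ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL);
}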
+/*----< ncmpio_wait() >------------------------------------------------------*/
+int
+ncmpio_wait(void *ncdp,
+            int   num_reqs,
+            int  *req_ids,  /* [num_reqs]: IN/OUT */
+            int  *statuses, /* [num_reqs] */
+            int   reqMode)  /* only check if NC_REQ_COLL or NC_REQ_INDEP */
+{
+    NC *ncp = (NC*)ncdp;
+    int coll_indep;
+
+    if (NC_indef(ncp)) /* wait must be called in data mode */
+        DEBUG_RETURN_ERROR(NC_EINDEFINE)
+
+    coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? NC_REQ_INDEP : NC_REQ_COLL;
+
+#ifdef ENABLE_REQ_AGGREGATION
+    /* check collective or independent mode */
+    if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp))
+        DEBUG_RETURN_ERROR(NC_ENOTINDEP)
+    else if (coll_indep == NC_REQ_COLL && NC_indep(ncp))
+        DEBUG_RETURN_ERROR(NC_EINDEP)
+
+    if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR;
+
+    return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep);
+#else
+    /* If request aggregation is disabled, we call an independent wait() for
+     * each request
+     */
+    int i, status=NC_NOERR, err;
+
+    if (coll_indep == NC_REQ_INDEP) {
+        /* This is called from ncmpi_wait(), which is an independent call.
+         * Argument num_reqs can be NC_REQ_ALL which means to flush all pending
+         * nonblocking requests. In this case, arguments req_ids and statuses
+         * will be ignored.
+         * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL,
+         * NC_PUT_REQ_ALL, or a non-negative value.
+         * Argument statuses can be NULL, meaning the caller only cares about
+         * the error code returned by this call, but not the statuses of
+         * individual nonblocking requests.
+         */
+        if (num_reqs == 0) return NC_NOERR;
+
+        /* This is called from ncmpi_wait which must be called in independent
+         * data mode, illegal in collective mode.
+         */
+        if (!NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_ENOTINDEP);
+
+        if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR;
+    }
+    else {
+        /* This is called from ncmpi_wait_all(), which is a collective call.
+         * Argument num_reqs can be NC_REQ_ALL which means to flush all pending
+         * nonblocking requests. In this case, arguments req_ids and statuses
+         * will be ignored.
+         * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL,
+         * NC_PUT_REQ_ALL, or a non-negative value.
+         * Argument statuses can be NULL, meaning the caller only cares about
+         * the error code returned by this call, but not the statuses of
+         * individual nonblocking requests.
+         */
+        /* the following line CANNOT be added, because ncmpi_wait_all() is a
+         * collective call, all processes must participate in some MPI
+         * collective operations used later on.
+         */
+        /* if (num_reqs == 0) return NC_NOERR; */
+
+        /* This is called from ncmpi_wait_all which must be called in
+         * collective data mode, illegal in independent mode. This also
+         * ensures the program will return to collective mode.
+         */
+        if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP);
+
+        /* must enter independent mode, as num_reqs may be different among
+           processes */
+        err = ncmpio_begin_indep_data(ncp);
+        if (status == NC_NOERR) status = err;
+    }
+
+    if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */
+        if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) {
+            while (ncp->numLeadGetReqs) {
+                /* commit one request at a time. Note ncp->numLeadGetReqs
+                 * will be decreased in req_commit()
+                 */
+                err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL,
+                                 NC_REQ_INDEP);
+                if (status == NC_NOERR) status = err;
+            }
+        }
+        if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) {
+            while (ncp->numLeadPutReqs) {
+                /* commit one request at a time. Note ncp->numLeadPutReqs
+                 * will be decreased in req_commit()
+                 */
+                err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL,
+                                 NC_REQ_INDEP);
+                if (status == NC_NOERR) status = err;
+            }
+        }
+    }
+    else {
+        for (i=0; i<num_reqs; i++) {
+            err = req_commit(ncp, 1, &req_ids[i],
+                             (statuses == NULL) ? NULL : &statuses[i],
+                             NC_REQ_INDEP);
+            if (status == NC_NOERR) status = err;
+        }
+    }
+
+    if (coll_indep == NC_REQ_COLL) {
+        /* return to collective data mode */
+        err = ncmpio_end_indep_data(ncp);
+        if (status == NC_NOERR) status = err;
+    }
+
+    return status; /* return the first error encountered, if there is any */
+#endif
+}
+
+#ifdef ENABLE_REQ_AGGREGATION
+/*----< construct_filetypes() >----------------------------------------------*/
+/* concatenate the requests into a single MPI derived filetype */
+static int
+construct_filetypes(NC *ncp,
+                    NC_lead_req   *lead_list, /* NC_REQ_WR or NC_REQ_RD */
+                    int            num_reqs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                    MPI_Count     *blocklens, /* [num_reqs] temp buffer */
+                    MPI_Count     *disps,     /* [num_reqs] temp buffer */
+#else
+                    int           *blocklens, /* [num_reqs] temp buffer */
+                    MPI_Aint      *disps,     /* [num_reqs] temp buffer */
+#endif
+                    NC_req        *reqs,      /* [num_reqs] */
+                    MPI_Datatype  *filetype)  /* OUT */
+{
+    int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req;
+    int mpireturn;
+    MPI_Datatype *ftypes;
+
+    if (num_reqs <= 0) { /* for participating collective call */
+        *filetype = MPI_BYTE;
+        return NC_NOERR;
+    }
+
+    /* hereinafter, num_reqs > 0 */
+    ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs);
+
+    /* create a filetype for each request */
+    last_contig_req = -1; /* index of the last contiguous request */
+    j = 0;                /* index of last valid ftypes */
+    for (i=0; i<num_reqs; i++) {
+        int is_ftype_contig, ndims;
+        NC_lead_req *lead;
+
+        lead = lead_list + reqs[i].lead_off;
+        ndims = lead->varp->ndims;
+
+        ftypes[j] = MPI_BYTE; /* in case the call below failed */
+
+        if (ndims == 0) { /* scalar variable */
+#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET
+            if (lead->varp->begin > NC_MAX_INT) {
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+                fSet(lead->flag, NC_REQ_SKIP); /* skip this request */
+                if ( lead->status != NULL &&
+                    *lead->status == NC_NOERR)
+                    *lead->status = err;
+                if (status == NC_NOERR)
+                    status = err; /* report first error */
+            }
+#endif
+            disps[j] = lead->varp->begin;
+            is_ftype_contig = 1;
+        }
+        else if (reqs[i].npairs == 1) { /* only one offset-length pair */
+            /* reqs[i].offset_start has been set back in wait_getput() */
+            disps[j] = reqs[i].offset_start;
+            is_ftype_contig = 1;
+        }
+        else { /* non-scalar variable with more offset-length pairs */
+            MPI_Offset offset, *count, *stride;
+            count  = reqs[i].start + ndims;
+            stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ?
+                     NULL : count + ndims;
+
+            err = ncmpio_filetype_create_vars(ncp,
+                                              lead->varp,
+                                              reqs[i].start,
+                                              count,
+                                              stride,
+                                              &offset,
+                                              &ftypes[j],
+                                              &is_ftype_contig);
+
+#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET
+            if (err == NC_NOERR && offset > NC_MAX_INT)
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+#endif
+            disps[j] = (MPI_Aint)offset;
+
+            if (err != NC_NOERR) {
+                fSet(lead->flag, NC_REQ_SKIP); /* skip this request */
+                if ( lead->status != NULL &&
+                    *lead->status == NC_NOERR)
+                    *lead->status = err;
+                if (status == NC_NOERR) status = err; /* report first error */
+                continue;
+            }
+        }
+
+        if (is_ftype_contig) {
+            MPI_Offset coalesced_len;
+
+            /* No need to construct a filetype */
+            coalesced_len = lead->varp->xsz * reqs[i].nelems;
+
+#ifdef HAVE_MPI_LARGE_COUNT
+            blocklens[j] = coalesced_len;
+#else
+            if (coalesced_len > NC_MAX_INT) {
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+                if (status == NC_NOERR)
+                    status = err; /* report first error */
+                coalesced_len = 0;
+            }
+            blocklens[j] = (int)coalesced_len;
+#endif
+            if (last_contig_req >= 0)
+                coalesced_len += blocklens[last_contig_req];
+#ifdef HAVE_MPI_LARGE_COUNT
+            if (last_contig_req >= 0 &&
+                disps[j] - disps[last_contig_req] ==
+                blocklens[last_contig_req]) {
+                blocklens[last_contig_req] = coalesced_len;
+                j--;
+            }
+            else last_contig_req = j;
+#else
+            /* if coalesced_len overflows 4-byte int, then skip coalescing */
+            if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 &&
+                disps[j] - disps[last_contig_req] ==
+                blocklens[last_contig_req]) {
+                blocklens[last_contig_req] = (int)coalesced_len;
+                j--;
+            }
+            else last_contig_req = j;
+#endif
+        }
+        else {
+            /* we will construct a filetype, set blocklen to 1 */
+            blocklens[j] = 1;
+            last_contig_req = -1;
+            all_ftype_contig = 0;
+        }
+    }
+    /* j is the new num_reqs */
+    num_reqs = j;
+
+    if (status != NC_NOERR) {
+        /* even if an error occurs, we must still participate in the
+           collective call to MPI_File_set_view() */
+        *filetype = MPI_BYTE;
+    }
+    else if (num_reqs == 1 && disps[0] == 0) {
+        if (ftypes[0] == MPI_BYTE)
+            *filetype = MPI_BYTE;
+        else {
+            mpireturn = MPI_Type_dup(ftypes[0], filetype);
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup");
+        }
+    }
+    else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */
+        /* all ftypes[] created fine, now concatenate all ftypes[] */
+        if (all_ftype_contig) {
+#ifdef HAVE_MPI_LARGE_COUNT
+            mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps,
+                                                   MPI_BYTE, filetype);
+#else
+            mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps,
+                                                 MPI_BYTE, filetype);
+#endif
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed");
+            else {
+                MPI_Type_commit(filetype);
+                err = NC_NOERR;
+            }
+        }
+        else {
+#ifdef HAVE_MPI_LARGE_COUNT
+            mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps,
+                                                 ftypes, filetype);
+#else
+            mpireturn = MPI_Type_create_struct(num_reqs, blocklens, disps,
+                                               ftypes, filetype);
+#endif
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct");
+            else {
+                MPI_Type_commit(filetype);
+                err = NC_NOERR;
+            }
+        }
+
+        if (err != NC_NOERR) *filetype = MPI_BYTE;
+        if (status == NC_NOERR) status = err; /* report the first error */
+    }
+
+    for (i=0; i<num_reqs; i++)
+        if (ftypes[i] != MPI_BYTE) MPI_Type_free(&ftypes[i]);
+    NCI_Free(ftypes);
+
+    return status;
+}
+
-/*----< ncmpio_wait() >------------------------------------------------------*/
-int
-ncmpio_wait(void *ncdp,
-            int   num_reqs,
-            int  *req_ids,  /* [num_reqs]: IN/OUT */
-            int  *statuses, /* [num_reqs] */
-            int   reqMode)  /* only check if NC_REQ_COLL or NC_REQ_INDEP */
+/*----<
construct_buffertypes() >--------------------------------------------*/ +/* the input requests, reqs[], are non-interleaving requests */ +static int +construct_buffertypes(NC_lead_req *lead_list, + int num_reqs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *blocklens, /* [num_reqs] temp buffer */ + MPI_Count *disps, /* [num_reqs] temp buffer */ +#else + int *blocklens, /* [num_reqs] temp buffer */ + MPI_Aint *disps, /* [num_reqs] temp buffer */ +#endif + NC_req *reqs, /* [num_reqs] */ + MPI_Datatype *buf_type) /* OUT */ { - NC *ncp = (NC*)ncdp; - int coll_indep; - - if (NC_indef(ncp)) /* wait must be called in data mode */ - DEBUG_RETURN_ERROR(NC_EINDEFINE) - - coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? NC_REQ_INDEP : NC_REQ_COLL; + int i, j, k, status=NC_NOERR, mpireturn; + MPI_Aint a0, ai; -#ifdef ENABLE_REQ_AGGREGATION - /* check collective or independent mode */ - if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_ENOTINDEP) - else if (coll_indep == NC_REQ_COLL && NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_EINDEP) + *buf_type = MPI_BYTE; + if (num_reqs == 0) return NC_NOERR; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; + /* create the I/O buffer derived data type */ - return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep); -#else - /* If request aggregation is disabled, we call an independent wait() for - * each request - */ - int i, status=NC_NOERR, err; + /* calculate blocklens[], and disps[] */ + for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; - } - else { - /* This is called from ncmpi_wait_all(), which is a collective call - * Argument num_reqs can be NC_REQ_ALL which means to flush all pending - * nonblocking requests. In this case, arguments req_ids and statuses - * will be ignored. - * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL, - * NC_PUT_REQ_ALL, or a non-negative value. - * Argument statuses can be NULL, meaning the caller only cares about - * the error code returned by this call, but not the statuses of - * individual nonblocking requests. - */ - /* the following line CANNOT be added, because ncmpi_wait_all() is a - * collective call, all processes must participate some MPI collective - * operations used later on. - */ - /* if (num_reqs == 0) return NC_NOERR; */ + req_size = lead->varp->xsz; + if (lead->varp->ndims > 0) { /* non-scalar variable */ + MPI_Offset *count = reqs[i].start + lead->varp->ndims; + if (!IS_RECVAR(lead->varp)) req_size *= count[0]; + for (k=1; kvarp->ndims; k++) req_size *= count[k]; + } - /* This is called from ncmpi_wait_all which must be called in - * collective data mode, illegal in independent mode. This also - * ensures the program will returns back to collective mode. 
- */ - if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP); +#ifdef HAVE_MPI_LARGE_COUNT + blocklens[j] = req_size; +#else + /* check int overflow */ + if (req_size > NC_MAX_INT) { /* skip this request */ + fSet(lead->flag, NC_REQ_SKIP); + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + continue; + } + blocklens[j] = (int)req_size; +#endif - /* must enter independent mode, as num_reqs may be different among - processes */ - err = ncmpio_begin_indep_data(ncp); - if (status == NC_NOERR) status = err; + MPI_Get_address(reqs[i].xbuf, &ai); + if (j == 0) a0 = ai; + disps[j] = MPI_Aint_diff(ai, a0); + j++; } + /* update num_reqs to number of valid requests */ + num_reqs = j; - if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */ - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) { - while (ncp->numLeadGetReqs) { - /* commit one request at a time. Note ncp->numLeadGetReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) { - while (ncp->numLeadPutReqs) { - /* commit one request at a time. Note ncp->numLeadPutReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - } - else { - for (i=0; i 0) { + /* concatenate buffer addresses into a single buffer type */ +#ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#else + mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#endif + if (mpireturn != MPI_SUCCESS) { + int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } + else + MPI_Type_commit(buf_type); } - if (coll_indep == NC_REQ_COLL) { - /* return to collective data mode */ - err = ncmpio_end_indep_data(ncp); - if (status == NC_NOERR) status = err; - } - - return status; /* return the first error encountered, if there is any */ -#endif + return status; } /* C struct for breaking down a request to a list of offset-length segments */ @@ -1381,8 +1371,8 @@ merge_requests(NC *ncp, MPI_Offset *nsegs, /* OUT: no. 
off-len pairs */ off_len **segs) /* OUT: [*nsegs] */ { - int i, j, status=NC_NOERR, ndims; - MPI_Offset nseg, *start, *count, *shape, *stride; + int i, j, status=NC_NOERR, ndims, is_incr; + MPI_Offset nseg, *start, *count, *shape, *stride, prev_offset; MPI_Aint addr, buf_addr; *nsegs = 0; /* total number of offset-length pairs */ @@ -1397,43 +1387,18 @@ merge_requests(NC *ncp, /* Count the number off-len pairs from reqs[], so we can malloc a * contiguous memory space for storing off-len pairs */ - for (i=0; ivarp->ndims; - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } - else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if (ndims == 0) { /* 1D record variable */ - (*nsegs)++; - continue; - } - nseg = 1; - if (stride != NULL && stride[ndims-1] > 1) - nseg = count[ndims-1]; /* count of last dimension */ - for (j=0; joff = reqs[i].offset_start; + seg_ptr->len = reqs[i].nelems * lead->varp->xsz; + seg_ptr->buf_addr = addr; + if (prev_offset > seg_ptr->off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr->off; + seg_ptr++; + continue; + } + ndims = lead->varp->ndims; if (ndims > 0) { start = reqs[i].start; @@ -1476,15 +1454,18 @@ merge_requests(NC *ncp, addr, start, count, stride, &nseg, /* OUT: number of offset-length pairs */ seg_ptr); /* OUT: array of offset-length pairs */ + + /* check if (*segs)[].off are in an increasing order */ + for (j=0; j seg_ptr[j].off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr[j].off; + } seg_ptr += nseg; /* append the list to the end of segs array */ } - /* check if (*segs)[].off are in an increasing order */ - for (i=1; i<*nsegs; i++) { - if ((*segs)[i-1].off > (*segs)[i].off) - break; - } - if (i < *nsegs) /* not in an increasing order */ + if (!is_incr) /* not in an increasing order */ /* sort the off-len array, segs[], in an increasing order */ qsort(*segs, (size_t)(*nsegs), sizeof(off_len), off_compare); @@ -1751,8 +1732,7 @@ req_aggregation(NC *ncp, void *buf; /* point to starting buffer, used by MPI-IO call */ MPI_Aint b_begin, b_addr; MPI_Datatype filetype, buf_type, *ftypes, *btypes; - MPI_File fh; - MPI_Offset max_end, offset; + MPI_Offset max_end; if (num_reqs == 0) { /* only NC_REQ_COLL can reach here for 0 request */ assert(coll_indep == NC_REQ_COLL); @@ -2064,13 +2044,8 @@ req_aggregation(NC *ncp, } NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - offset = 0; - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2079,112 +2054,25 @@ req_aggregation(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_len, buf_type, - buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_len, buf_type, buf); + +assert(0); +/* This subroutine is no longer used. 
+ PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); +*/ + if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } -/*----< calculate_access_range() >-------------------------------------------*/ -/* Returns the file offsets of access range of this request: starting file - * offset and end offset (exclusive). - * Note zero-length request should never call this subroutine. - */ -static int -calculate_access_range(const NC *ncp, - const NC_var *varp, - const MPI_Offset *start, /* [varp->ndims] */ - const MPI_Offset *count, /* [varp->ndims] */ - const MPI_Offset *stride, /* [varp->ndims] */ - MPI_Offset *start_off, /* OUT: start offset */ - MPI_Offset *end_off) /* OUT: end offset */ -{ - int i, ndims = varp->ndims; /* number of dimensions of this variable */ - - /* - * varp->dsizes[] is computed from right to left product of shape - * For example, a 3D array of size 5x4x3 in C order, - * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3 - * For record variable: dsizes[0]=12 dsizes[1]=12 dsizes[2]=3 - */ - if (IS_RECVAR(varp)) { - *start_off = 0; - *end_off = 0; - if (stride == NULL) { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1); - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1))*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)) * ncp->recsize; - } - else { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i]) * - varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize; - } - } - else { - if (stride == NULL) { - /* first handle the least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1] + (count[ndims-1]-1); - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)) * varp->dsizes[i+1]; - } - } - else { - /* first handle the least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - } - *start_off += varp->begin; /* beginning file offset of this variable */ - *end_off += varp->begin + varp->xsz; - - return NC_NOERR; -} - /*----< wait_getput() >------------------------------------------------------*/ 
static int wait_getput(NC *ncp, @@ -2210,8 +2098,17 @@ wait_getput(NC *ncp, varp = lead->varp; if (varp->ndims == 0) { /* scalar variable */ - reqs[i].offset_start = varp->begin; - reqs[i].offset_end = varp->begin + varp->xsz; + reqs[i].offset_start += varp->begin; + reqs[i].offset_end += varp->begin; + } + else if (reqs[i].npairs == 1) { /* only one offset-length pair */ + /* reqs[i].offset_end == reqs[i].nelems * varp->xsz */ + MPI_Offset off = varp->begin; + + if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize; + + reqs[i].offset_start += off; + reqs[i].offset_end += off; } else { /* start/count/stride have been allocated in a contiguous array */ @@ -2221,8 +2118,8 @@ wait_getput(NC *ncp, count + varp->ndims; /* calculate access range of this request */ - calculate_access_range(ncp, varp, reqs[i].start, count, stride, - &reqs[i].offset_start, &reqs[i].offset_end); + ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride, + &reqs[i].offset_start, &reqs[i].offset_end); } if (i > 0) { /* check if offset_start are in a monotonic nondecreasing order */ @@ -2304,8 +2201,7 @@ mgetput(NC *ncp, void *buf=NULL; NC_lead_req *lead_list; MPI_Datatype filetype, buf_type=MPI_BYTE; - MPI_Offset offset=0, buf_count=0; - MPI_File fh; + MPI_Offset buf_count=0; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *blocklens; @@ -2489,12 +2385,8 @@ mgetput(NC *ncp, mpi_io: NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2503,17 +2395,19 @@ mgetput(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_count, - buf_type, buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_count, buf_type, buf); + assert(0); + PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } +#endif + diff --git a/src/drivers/pncio/Makefile.am b/src/drivers/pncio/Makefile.am new file mode 100644 index 000000000..f5527c8d5 --- /dev/null +++ b/src/drivers/pncio/Makefile.am @@ -0,0 +1,51 @@ +# +# Copyright (C) 2025, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. 
+#
+# @configure_input@
+
+SUFFIXES = .a .o .c .m4 .h
+
+AM_CPPFLAGS  = -I${top_srcdir}/src/include
+AM_CPPFLAGS += -I${top_builddir}/src/include
+AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include
+AM_CPPFLAGS += -I${top_builddir}/src/drivers/include
+
+if PNETCDF_DEBUG
+   AM_CPPFLAGS += -DPNETCDF_DEBUG
+endif
+
+noinst_LTLIBRARIES = libpncio.la
+
+H_SRCS = pncio.h
+
+C_SRCS = pncio_read.c \
+         pncio_write.c \
+         pncio_open.c \
+         pncio_close.c \
+         pncio_fstype.c \
+         pncio_aggregate.c \
+         pncio_read_str.c \
+         pncio_read_coll.c \
+         pncio_read_str_naive.c \
+         pncio_write_coll.c \
+         pncio_write_str.c \
+         pncio_write_str_naive.c \
+         pncio_utils.c \
+         pncio_lustre_open.c \
+         pncio_lustre_wrcoll.c \
+         pncio_lustre_wrstr.c \
+         pncio_lock.c \
+         pncio_set_size.c \
+         pncio_sync.c \
+         pncio_delete.c \
+         pncio_set_view.c \
+         pncio_hints.c
+
+
+libpncio_la_SOURCES = $(C_SRCS) $(H_SRCS)
+
+CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out
+
+tests-local: all
+
diff --git a/src/drivers/pncio/pncio.h b/src/drivers/pncio/pncio.h
new file mode 100644
index 000000000..035febd79
--- /dev/null
+++ b/src/drivers/pncio/pncio.h
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifndef H_PNCIO
+#define H_PNCIO
+
+#include
+#include
+#include
+#include <unistd.h>    /* pwrite() */
+
+#include
+#include <string.h>    /* memcpy() */
+#include <stddef.h>    /* size_t */
+#include <sys/types.h> /* off_t */
+#include
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#define FDTYPE int
+
+#include <mpi.h>
+#include
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+#define NMEASURES 8
+#endif
+
+#define PNCIO_LOCKS 300 /* file system supports fcntl()-style locking */
+#define PNCIO_Feature(a, b) ((b == PNCIO_LOCKS) ? 1 : 0)
+
+#if defined(F_SETLKW64)
+#define PNCIO_UNLOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock64(fd, F_SETLK, F_UNLCK, offset, whence, len)
+#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock64(fd, F_SETLKW, F_WRLCK, offset, whence, len)
+#else
+#define PNCIO_UNLOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock(fd, F_SETLK, F_UNLCK, offset, whence, len)
+#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock(fd, F_SETLKW, F_WRLCK, offset, whence, len)
+#endif
+
+
+#define PNCIO_PERM 0666 /* file creation permission mask */
+
+#define PNCIO_UFS          152 /* Unix file system */
+#define PNCIO_LUSTRE       163 /* Lustre */
+#define PNCIO_FSTYPE_MPIIO  -1 /* Use MPI-IO */
+#define PNCIO_FSTYPE_CHECK   0 /* Use PnetCDF PNCIO drivers */
+
+#define PNCIO_LUSTRE_MAX_OSTS 256 /* Maximum number of Lustre OSTs if hint
+                                   * striping_factor is not set by user.
+                                   */
+
+#define PNCIO_CB_BUFFER_SIZE_DFLT     "16777216"
+#define PNCIO_IND_RD_BUFFER_SIZE_DFLT "4194304"
+#define PNCIO_IND_WR_BUFFER_SIZE_DFLT "524288"
+#define PNCIO_CB_CONFIG_LIST_DFLT     "*:1"
+
+/* PNCIO_DS_WR_NPAIRS_LB is the lower bound of the total number of
+ * offset-length pairs over the non-aggregator senders to be received by an
+ * I/O aggregator to skip the potentially expensive heap-merge sort that
+ * determines whether or not data sieving write is necessary.
+ * PNCIO_DS_WR_NAGGRS_LB is the lower bound of the number of non-aggregators
+ * sending their offset-length pairs to an I/O aggregator.
+ * Both conditions must be met to skip the heap-merge sort.
+ *
+ * When data sieving is enabled, read-modify-write is performed at each round
+ * of two-phase I/O at each aggregator. The following describes whether
+ * detecting "holes" in a write region is necessary, depending on whether the
+ * data sieving hint, romio_ds_write, is set to enable, disable, or automatic.
+ * + automatic - We need to check whether holes exist. If holes exist, the
+ *   "read-modify" part must run. If not, "read-modify" can be skipped.
+ * + enable - the "read-modify" part must be performed; hole checking, and
+ *   thus the heap-merge sort, can be skipped.
+ * + disable - the "read-modify" part must be skipped and holes need not be
+ *   checked, but srt_off_len must be constructed to merge all others_req[]
+ *   into a single sorted list, which requires calling a heap-merge sort.
+ *   This step is necessary because write data from all non-aggregators are
+ *   received into the same write_buf, with a possibility of overlaps, and
+ *   srt_off_len stores the coalesced offset-length pairs of individual
+ *   non-contiguous write requests and will be used to write them to the file.
+ *
+ * Heap-merge sort merges offset-length pairs received from all non-aggregators
+ * into a single list, which can be expensive. Its cost can be even larger than
+ * the cost of "read" in "read-modify-write". The two constants below are the
+ * lower bounds used to determine whether or not to perform such sorting, when
+ * data sieving is set to the automatic mode.
+ */
+#define PNCIO_DS_WR_NPAIRS_LB 8192
+#define PNCIO_DS_WR_NAGGRS_LB 256
+#define DO_HEAP_MERGE(nrecv, npairs) ((nrecv) > PNCIO_DS_WR_NAGGRS_LB || (npairs) > PNCIO_DS_WR_NPAIRS_LB)
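A small, self-contained sketch of how the DO_HEAP_MERGE threshold above can be
consulted; it assumes this header is on the include path and an MPI compiler
is used, and the sample counts are invented for illustration:

#include <stdio.h>
#include "pncio.h"

int main(void)
{
    long nrecv  = 300;  /* non-aggregators sending offset-length pairs */
    long npairs = 4096; /* total offset-length pairs received */

    /* the macro trips once either count exceeds its bound (300 > 256 here) */
    printf("DO_HEAP_MERGE(300, 4096) = %d\n",
           (int) DO_HEAP_MERGE(nrecv, npairs));
    return 0;
}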
+
+#define PNCIO_TYPE_DECREASE 0x00000001 /* if not monotonic nondecreasing */
+#define PNCIO_TYPE_OVERLAP  0x00000002 /* if contains overlapping regions */
+#define PNCIO_TYPE_NEGATIVE 0x00000004 /* if one of displacements is negative */
+
+enum {
+    PNCIO_HINT_AUTO    = 0,
+    PNCIO_HINT_ENABLE  = 1,
+    PNCIO_HINT_DISABLE = 2
+};
+
+typedef struct {
+    int striping_factor;
+    int striping_unit;
+    int cb_read;
+    int cb_write;
+    int cb_nodes;
+    int cb_buffer_size;
+    int ds_read;
+    int ds_write;
+    int no_indep_rw;
+    int ind_rd_buffer_size;
+    int ind_wr_buffer_size;
+    int start_iodevice;
+    int *ranklist;
+
+    union {
+        struct {
+            int num_osts;
+            int overstriping_ratio;
+        } lustre;
+    } fs_hints;
+} PNCIO_Hints;
+
+typedef struct {
+    MPI_Datatype  type;  /* MPI derived datatype */
+    MPI_Offset    size;  /* total size in bytes (sum of len[*]) */
+    MPI_Count     count; /* number of off-len pairs */
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset   *off;   /* [count] byte offsets */
+    MPI_Offset   *len;   /* [count] block lengths in bytes */
+#else
+    MPI_Offset   *off;   /* [count] byte offsets */
+    int          *len;   /* [count] block lengths in bytes */
+#endif
+    MPI_Count     idx;   /* index of off-len pairs consumed so far */
+    MPI_Aint      rem;   /* remaining amount in the pair to be consumed */
+    int           is_contig; /* whether view of file or buffer is contiguous */
+} PNCIO_View;
+
+typedef struct {
+    MPI_Comm     comm;        /* communicator indicating who called open */
+    const char  *filename;
+    int          file_system; /* type of file system */
+
+    int          fd_sys;      /* system file descriptor */
+    int          num_nodes;   /* number of unique compute nodes from
+                               * MPI_Get_processor_name() */
+    int         *node_ids;    /* [nprocs] node IDs of each rank */
+    int          access_mode; /* Access mode (sequential, append, etc.),
+                               * possibly modified to deal with
+                               * data sieving or deferred open */
+
+    int          is_open;     /* no_indep_rw, 0: not open yet 1: is open */
+
+    int          skip_read;   /* whether to skip reads in read-modify-write */
+
+    MPI_Offset   disp;        /* file displacement */
+    MPI_Datatype filetype;    /* file type set in fileview */
+                              /* etype in fileview is always MPI_BYTE in PnetCDF */
+    PNCIO_View   flat_file;  /* flattened filetype */
+
+    int          atomicity;  /* true=atomic, false=nonatomic */
+    char        *io_buf;     /* two-phase buffer allocated out of i/o path */
+    int          is_agg;     /* bool: if I am an aggregator */
+    int          my_cb_nodes_index; /* my index into fd->hints->ranklist[]. -1 if N/A */
+    PNCIO_Hints *hints;      /* structure containing fs-indep. info values */
+    MPI_Info     info;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double    write_timing[NMEASURES];
+    double    read_timing[NMEASURES];
+    MPI_Count write_counter[NMEASURES];
+    MPI_Count read_counter[NMEASURES];
+#endif
+} PNCIO_File;
+
+typedef struct {
+    MPI_Offset *offsets;  /* array of offsets */
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset *lens;     /* array of lengths */
+    MPI_Count  *mem_ptrs; /* array of pointers. used in the read/write phase to
+                           * indicate where the data is stored in memory
+                           * promoted to MPI_Count so we can construct types
+                           * with _c versions
+                           */
+    MPI_Count   count;    /* size of above arrays */
+#else
+    int        *lens;
+    MPI_Aint   *mem_ptrs;
+    size_t      count;
+#endif
+    size_t      curr; /* index of offsets/lens that is currently being processed */
+} PNCIO_Access;
+
+/*---- APIs -----------------------------------------------------------------*/
+extern
+int PNCIO_FileSysType(const char *filename);
+
+extern
+int PNCIO_File_open(MPI_Comm comm, const char *filename, int amode,
+                    MPI_Info info, PNCIO_File *fh);
+
+extern
+int PNCIO_File_close(PNCIO_File *fh);
+
+extern
+int PNCIO_File_set_view(PNCIO_File *fh, MPI_Offset disp, MPI_Datatype filetype,
+                        MPI_Aint npairs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                        MPI_Count *offsets, MPI_Count *lengths
+#else
+                        MPI_Offset *offsets, int *lengths
+#endif
+);
+
+extern
+int PNCIO_File_sync(PNCIO_File *fh);
+
+extern
+int PNCIO_File_delete(const char *filename);
+
+extern
+int PNCIO_File_set_size(PNCIO_File *fh, MPI_Offset size);
+
+extern
+int PNCIO_File_get_size(PNCIO_File *fh, MPI_Offset *size);
+
+extern
+int PNCIO_File_get_info(PNCIO_File *fh, MPI_Info *info_used);
+
+extern
+int PNCIO_File_SetInfo(PNCIO_File *fh, MPI_Info users_info);
+
+/* PNC I/O APIs */
+extern
+MPI_Offset PNCIO_File_write_at(PNCIO_File *fh, MPI_Offset offset,
+                               const void *buf, PNCIO_View buf_view);
+extern
+MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, MPI_Offset offset,
+                                   const void *buf, PNCIO_View buf_view);
+
+extern
+MPI_Offset PNCIO_File_read_at(PNCIO_File *fh, MPI_Offset offset, void *buf,
+                              PNCIO_View buf_view);
+extern
+MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh, MPI_Offset offset, void *buf,
+                                  PNCIO_View buf_view);
+
+extern
+MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, const void *buf,
+                             MPI_Offset w_size, MPI_Offset offset);
+
+extern
+MPI_Offset PNCIO_ReadContig(PNCIO_File *fd, void *buf, MPI_Offset r_size,
+                            MPI_Offset offset);
+
+/* utility APIs */
+extern
+void PNCIO_Calc_file_domains(MPI_Offset * st_offsets,
+             MPI_Offset *end_offsets, int nprocs, int nprocs_for_coll,
+             MPI_Offset *min_st_offset_ptr, MPI_Offset **fd_start_ptr,
+             MPI_Offset **fd_end_ptr, MPI_Offset *fd_size_ptr,
+             int striping_unit);
+
+extern
+void PNCIO_Calc_my_req(PNCIO_File *fd, MPI_Offset min_st_offset,
+             MPI_Offset *fd_start, MPI_Offset *fd_end, MPI_Offset fd_size,
+             int nprocs, MPI_Count *count_my_req_procs_ptr,
+             MPI_Count **count_my_req_per_proc_ptr,
+             PNCIO_Access **my_req_ptr, MPI_Aint **buf_idx_ptr);
+
+extern
+void PNCIO_Calc_others_req(PNCIO_File *fd, MPI_Count count_my_req_procs,
+             MPI_Count *count_my_req_per_proc, PNCIO_Access *my_req,
+             int nprocs, int myrank, MPI_Count
*count_others_req_procs_ptr, + MPI_Count **count_others_req_per_proc_ptr, + PNCIO_Access **others_req_ptr); + +extern +void PNCIO_Free_my_req(MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, MPI_Aint *buf_idx); + +extern +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req); + + +extern +int PNCIO_Calc_aggregator(PNCIO_File *fd, MPI_Offset off, MPI_Offset min_off, + MPI_Offset *len, MPI_Offset fd_size, MPI_Offset *fd_end); + +extern +void PNCIO_Heap_merge(PNCIO_Access *others_req, MPI_Count *count, + MPI_Offset *srt_off, MPI_Count *srt_len, MPI_Count *start_pos, + int nprocs, int nprocs_recv, MPI_Count total_elements); + +/* Generic APIs */ +extern +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +/* Lustre */ +extern +int PNCIO_Lustre_create(PNCIO_File *fd, int access_mode); + +extern +int PNCIO_Lustre_open(PNCIO_File *fd); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +#endif diff --git a/src/drivers/pncio/pncio_aggregate.c b/src/drivers/pncio/pncio_aggregate.c new file mode 100644 index 000000000..b75e48d92 --- /dev/null +++ b/src/drivers/pncio/pncio_aggregate.c @@ -0,0 +1,560 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +/* This file contains four functions: + * + * PNCIO_Calc_aggregator() + * PNCIO_Calc_file_domains() + * PNCIO_Calc_my_req() + * PNCIO_Free_my_req() + * PNCIO_Calc_others_req() + * PNCIO_Free_others_req() + * + * The last three of these were originally in ad_read_coll.c, but they are + * also shared with ad_write_coll.c. I felt that they were better kept with + * the rest of the shared aggregation code. + */ + +/* Discussion of values available from above: + * + * MPI_Offset st_offsets[0..nprocs-1] + * MPI_Offset end_offsets[0..nprocs-1] + * These contain a list of start and end offsets for each process in + * the communicator. For example, an access at loc 10, size 10 would + * have a start offset of 10 and end offset of 19. + * int nprocs + * number of processors in the collective I/O communicator + * MPI_Offset min_st_offset + * MPI_Offset fd_start[0..nprocs_for_coll-1] + * starting location of "file domain"; region that a given process will + * perform aggregation for (i.e. 
actually do I/O) + * MPI_Offset fd_end[0..nprocs_for_coll-1] + * start + size - 1 roughly, but it can be less, or 0, in the case of + * uneven distributions + */ + +/* PNCIO_Calc_aggregator() + * + * The intention here is to implement a function which provides basically + * the same functionality as in Rajeev's original version of + * PNCIO_Calc_my_req(). He used a ceiling division approach to assign the + * file domains, and we use the same approach here when calculating the + * location of an offset/len in a specific file domain. Further we assume + * this same distribution when calculating the rank_index, which is later + * used to map to a specific process rank in charge of the file domain. + * + * A better (i.e. more general) approach would be to use the list of file + * domains only. This would be slower in the case where the + * original ceiling division was used, but it would allow for arbitrary + * distributions of regions to aggregators. We'd need to know the + * nprocs_for_coll in that case though, which we don't have now. + * + * Note a significant difference between this function and Rajeev's old code: + * this code doesn't necessarily return a rank in the range + * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a + * result of the rank mapping; any set of ranks in the communicator could be + * used now. + * + * Returns an integer representing a rank in the collective I/O communicator. + * + * The "len" parameter is also modified to indicate the amount of data + * actually available in this file domain. + */ +int PNCIO_Calc_aggregator(PNCIO_File *fd, + MPI_Offset off, + MPI_Offset min_off, + MPI_Offset *len, + MPI_Offset fd_size, + MPI_Offset *fd_end) +{ + int rank_index, rank; + MPI_Offset avail_bytes; + + /* get an index into our array of aggregators */ + rank_index = (int) ((off - min_off + fd_size) / fd_size - 1); + + if (fd->hints->striping_unit > 0) { + /* Implementation for file domain alignment. Note fd_end[] have been + * aligned with file system lock boundaries when it was produced by + * PNCIO_Calc_file_domains(). + */ + rank_index = 0; + while (off > fd_end[rank_index]) + rank_index++; + } + + /* we index into fd_end with rank_index, and fd_end was allocated to be no + * bigger than fd->hins->cb_nodes. If we ever violate that, we're + * overrunning arrays. Obviously, we should never ever hit this abort */ + if (rank_index >= fd->hints->cb_nodes || rank_index < 0) { + fprintf(stderr, + "Error in PNCIO_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size="OFFFMT" off="OFFFMT"\n", + rank_index, fd->hints->cb_nodes, fd_size, off); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + /* remember here that even in Rajeev's original code it was the case that + * different aggregators could end up with different amounts of data to + * aggregate. here we use fd_end[] to make sure that we know how much + * data this aggregator is working with. + * + * the +1 is to take into account the end vs. length issue. + */ + avail_bytes = fd_end[rank_index] + 1 - off; + if (avail_bytes < *len) { + /* this file domain only has part of the requested contig. 
region */ + *len = avail_bytes; + } + + /* map our index to a rank */ + /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ + rank = fd->hints->ranklist[rank_index]; + + return rank; +} + +void PNCIO_Calc_file_domains(MPI_Offset *st_offsets, + MPI_Offset *end_offsets, + int nprocs, + int nprocs_for_coll, + MPI_Offset *min_st_offset_ptr, + MPI_Offset **fd_start_ptr, + MPI_Offset **fd_end_ptr, + MPI_Offset *fd_size_ptr, + int striping_unit) +{ +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. */ + + MPI_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; + int i; + +/* find min of start offsets and max of end offsets of all processes */ + + min_st_offset = st_offsets[0]; + max_end_offset = end_offsets[0]; + + for (i = 1; i < nprocs; i++) { + min_st_offset = MIN(min_st_offset, st_offsets[i]); + max_end_offset = MAX(max_end_offset, end_offsets[i]); + } + +/* determine the "file domain (FD)" of each process, i.e., the portion of + the file that will be "owned" by each process */ + +/* partition the total file access range equally among nprocs_for_coll + processes */ + fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1) / nprocs_for_coll; + /* ceiling division as in HPF block distribution */ + + *fd_start_ptr = (MPI_Offset *) NCI_Malloc(nprocs_for_coll * 2 * sizeof(MPI_Offset)); + *fd_end_ptr = *fd_start_ptr + nprocs_for_coll; + + fd_start = *fd_start_ptr; + fd_end = *fd_end_ptr; + + /* Wei-keng Liao: implementation for fild domain alignment to nearest file + * lock boundary (as specified by striping_unit hint). Could also + * experiment with other alignment strategies here */ + if (striping_unit > 0) { + MPI_Offset end_off; + int rem_front, rem_back; + + /* align fd_end[0] to the nearest file lock boundary */ + fd_start[0] = min_st_offset; + end_off = fd_start[0] + fd_size; + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[0] = end_off - 1; + + /* align fd_end[i] to the nearest file lock boundary */ + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + end_off = min_st_offset + fd_size * (i + 1); + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[i] = end_off - 1; + } + fd_end[nprocs_for_coll - 1] = max_end_offset; + } else { /* no hints set: do things the 'old' way */ + fd_start[0] = min_st_offset; + fd_end[0] = min_st_offset + fd_size - 1; + + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + fd_end[i] = fd_start[i] + fd_size - 1; + } + } + +/* take care of cases in which the total file access range is not + divisible by the number of processes. In such cases, the last + process, or the last few processes, may have unequal load (even 0). + For example, a range of 97 divided among 16 processes. + Note that the division is ceiling division. 
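To make the ceiling-division partitioning concrete, below is a small standalone sketch, separate from the patch, that reproduces the 97-byte/16-process example from the comment above and shows how the trailing domains shrink or become empty:

#include <stdio.h>

/* Sketch: partition a 97-byte access range among 16 aggregators the way
 * PNCIO_Calc_file_domains() does when no striping alignment is in effect. */
int main(void) {
    long long min_st_offset = 0, max_end_offset = 96;  /* 97 bytes total */
    int nprocs_for_coll = 16;
    long long fd_size = (max_end_offset - min_st_offset + 1
                         + nprocs_for_coll - 1) / nprocs_for_coll; /* ceil(97/16) = 7 */
    for (int i = 0; i < nprocs_for_coll; i++) {
        long long st  = min_st_offset + (long long)i * fd_size;
        long long end = st + fd_size - 1;
        if (st > max_end_offset)       st = end = -1;        /* empty domain */
        else if (end > max_end_offset) end = max_end_offset; /* short domain */
        printf("domain %2d: [%lld, %lld]\n", i, st, end);
    }
    /* Domain 13 ends up as [91, 96] with only 6 bytes, and domains 14 and
     * 15 are empty, mirroring the clamping loop that follows. */
    return 0;
}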
*/ + + for (i = 0; i < nprocs_for_coll; i++) { + if (fd_start[i] > max_end_offset) + fd_start[i] = fd_end[i] = -1; + if (fd_end[i] > max_end_offset) + fd_end[i] = max_end_offset; + } + + *fd_size_ptr = fd_size; + *min_st_offset_ptr = min_st_offset; +} + + +/* PNCIO_Calc_my_req() - calculate what portions of the access requests + * of this process are located in the file domains of various processes + * (including this one) + */ +void PNCIO_Calc_my_req(PNCIO_File *fd, + MPI_Offset min_st_offset, + MPI_Offset *fd_start, + MPI_Offset *fd_end, + MPI_Offset fd_size, + int nprocs, + MPI_Count *count_my_req_procs_ptr, + MPI_Count **count_my_req_per_proc_ptr, + PNCIO_Access **my_req_ptr, + MPI_Aint **buf_idx_ptr) +/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? + They are used as memory buffer indices so it seems like the 2G limit is in effect */ +{ + MPI_Count *count_my_req_per_proc, count_my_req_procs, l; + MPI_Aint *buf_idx; + int proc; + size_t memLen, alloc_sz; + MPI_Offset fd_len, rem_len, curr_idx, off, *off_ptr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_ptr; +#else + int *len_ptr; +#endif + PNCIO_Access *my_req; + + *count_my_req_per_proc_ptr = NCI_Calloc(nprocs, sizeof(MPI_Count)); + count_my_req_per_proc = *count_my_req_per_proc_ptr; +/* count_my_req_per_proc[i] gives the no. of contig. requests of this + process in process i's file domain. calloc initializes to zero. + I'm allocating memory of size nprocs, so that I can do an + MPI_Alltoall later on.*/ + + buf_idx = (MPI_Aint *) NCI_Malloc(nprocs * sizeof(MPI_Aint)); +/* buf_idx is relevant only if buftype_is_contig. + buf_idx[i] gives the index into user_buf where data received + from proc. i should be placed. This allows receives to be done + without extra buffer. This can't be done if buftype is not contig. */ + + /* initialize buf_idx to -1 */ + for (int i = 0; i < nprocs; i++) + buf_idx[i] = -1; + + /* one pass just to calculate how much space to allocate for my_req */ + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (fd->flat_file.len[i] == 0) + continue; + off = fd->flat_file.off[i]; + fd_len = fd->flat_file.len[i]; + /* note: we set fd_len to be the total size of the access. then + * PNCIO_Calc_aggregator() will modify the value to return the + * amount that was available from the file domain that holds the + * first part of the access. + */ + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end); + count_my_req_per_proc[proc]++; + + /* figure out how much data is remaining in the access (i.e. wasn't + * part of the file domain that had the starting byte); we'll take + * care of this data (if there is any) in the while loop below. + */ + rem_len = fd->flat_file.len[i] - fd_len; + + while (rem_len != 0) { + off += fd_len; /* point to first remaining byte */ + fd_len = rem_len; /* save remaining size, pass to calc */ + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, + fd_size, fd_end); + + count_my_req_per_proc[proc]++; + rem_len -= fd_len; /* reduce remaining length by amount from fd */ + } + } + +/* now allocate space for my_req, offset, and len */ + + *my_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + my_req = *my_req_ptr; + + /* combine offsets and lens into a single regions so we can make one + * exchange instead of two later on. 
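The way one contiguous request is split across consecutive file domains (the rem_len loop above) can be illustrated with a minimal sketch; it uses the unaligned ceiling-division mapping and continues the 97/16 numbers, so fd_size is 7:

#include <stdio.h>

/* Sketch of how one contiguous request is carved up across file domains,
 * as in the rem_len loop of PNCIO_Calc_my_req(). Striping alignment is
 * ignored for brevity. */
int main(void) {
    long long min_off = 0, fd_size = 7;   /* from the 97/16 example */
    long long off = 10, len = 20;         /* request covers [10, 29] */
    while (len > 0) {
        int rank_index = (int)((off - min_off + fd_size) / fd_size - 1);
        long long fd_end = min_off + (rank_index + 1) * fd_size - 1;
        long long avail  = fd_end + 1 - off;   /* bytes left in this domain */
        long long piece  = (avail < len) ? avail : len;
        printf("aggregator index %d gets [%lld, %lld]\n",
               rank_index, off, off + piece - 1);
        off += piece;
        len -= piece;
    }
    /* prints pieces [10,13], [14,20], [21,27], [28,29] landing in file
     * domains 1 through 4, one my_req entry per piece */
    return 0;
}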
Over-allocate the 'offsets' array and + * make 'lens' point to the over-allocated part + */ + memLen = 0; + for (int i = 0; i < nprocs; i++) + memLen += count_my_req_per_proc[i]; + +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2; + my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + my_req[0].lens = my_req[0].offsets + memLen; +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + my_req[0].lens = (int*) (my_req[0].offsets + memLen); +#endif + + off_ptr = my_req[0].offsets; + len_ptr = my_req[0].lens; + count_my_req_procs = 0; + for (int i = 0; i < nprocs; i++) { + if (count_my_req_per_proc[i]) { + my_req[i].offsets = off_ptr; + off_ptr += count_my_req_per_proc[i]; + my_req[i].lens = len_ptr; + len_ptr += count_my_req_per_proc[i]; + count_my_req_procs++; + } + my_req[i].count = 0; /* will be incremented where needed + * later */ + } + +/* now fill in my_req */ + curr_idx = 0; + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (fd->flat_file.len[i] == 0) + continue; + off = fd->flat_file.off[i]; + fd_len = fd->flat_file.len[i]; + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end); + + /* for each separate contiguous access from this process */ + if (buf_idx[proc] == -1) { + assert(curr_idx == (MPI_Aint) curr_idx); + buf_idx[proc] = (MPI_Aint) curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + + rem_len = fd->flat_file.len[i] - fd_len; + + /* store the proc, offset, and len information in an array + * of structures, my_req. Each structure contains the + * offsets and lengths located in that process's FD, + * and the associated count. + */ + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + + while (rem_len != 0) { + off += fd_len; + fd_len = rem_len; + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, + fd_size, fd_end); + + if (buf_idx[proc] == -1) { + assert(curr_idx == (MPI_Aint) curr_idx); + buf_idx[proc] = (MPI_Aint) curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + rem_len -= fd_len; + + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + } + } + + *count_my_req_procs_ptr = count_my_req_procs; + *buf_idx_ptr = buf_idx; +} + +void PNCIO_Free_my_req(MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, + MPI_Aint *buf_idx) +{ + NCI_Free(count_my_req_per_proc); + NCI_Free(my_req[0].offsets); + NCI_Free(my_req); + NCI_Free(buf_idx); +} + +void PNCIO_Calc_others_req(PNCIO_File *fd, + MPI_Count count_my_req_procs, + MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, + int nprocs, + int myrank, + MPI_Count *count_others_req_procs_ptr, + MPI_Count **count_others_req_per_proc_ptr, + PNCIO_Access **others_req_ptr) +{ +/* determine what requests of other processes lie in this process's + file domain */ + +/* count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. 
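The handshake that opens PNCIO_Calc_others_req() below is a single MPI_Alltoall of these per-destination counts. Here is a self-contained sketch of that step; it uses MPI_INT for brevity where the patch uses MPI_COUNT, and the count pattern is made up:

#include <mpi.h>
#include <stdlib.h>

/* Sketch of the count handshake: each rank tells every other rank how many
 * contiguous pieces of its request fall in that rank's file domain. */
int main(int argc, char **argv) {
    int nprocs, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int *my_counts     = malloc(nprocs * sizeof(int)); /* what I send to i */
    int *others_counts = malloc(nprocs * sizeof(int)); /* what i sends me  */
    for (int i = 0; i < nprocs; i++)
        my_counts[i] = (rank + i) % 3;   /* illustrative counts only */

    /* after this call, others_counts[i] sizes the Irecv buffers that the
     * real code posts for rank i's offsets and lengths */
    MPI_Alltoall(my_counts, 1, MPI_INT, others_counts, 1, MPI_INT,
                 MPI_COMM_WORLD);

    free(my_counts);
    free(others_counts);
    MPI_Finalize();
    return 0;
}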
*/ + + MPI_Count *count_others_req_per_proc, count_others_req_procs; + size_t alloc_sz; + int i, j; + MPI_Request *requests; + PNCIO_Access *others_req; + size_t memLen; + MPI_Offset *off_ptr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_ptr; + MPI_Count *mem_ptr; +#else + int *len_ptr; + MPI_Aint *mem_ptr; +#endif + +/* first find out how much to send/recv and from/to whom */ + count_others_req_per_proc = NCI_Malloc(nprocs * sizeof(MPI_Count)); + + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); + + *others_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + others_req = *others_req_ptr; + + memLen = 0; + for (i = 0; i < nprocs; i++) + memLen += count_others_req_per_proc[i]; + +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2 + sizeof(MPI_Count); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = others_req[0].offsets + memLen; + others_req[0].mem_ptrs = (MPI_Count*) (others_req[0].lens + memLen); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int) + sizeof(MPI_Aint); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = (int *) (others_req[0].offsets + memLen); + others_req[0].mem_ptrs = (MPI_Aint*) (others_req[0].lens + memLen); +#endif + off_ptr = others_req[0].offsets; + len_ptr = others_req[0].lens; + mem_ptr = others_req[0].mem_ptrs; + + count_others_req_procs = 0; + for (i = 0; i < nprocs; i++) { + if (count_others_req_per_proc[i]) { + others_req[i].count = count_others_req_per_proc[i]; + others_req[i].offsets = off_ptr; + off_ptr += count_others_req_per_proc[i]; + others_req[i].lens = len_ptr; + len_ptr += count_others_req_per_proc[i]; + others_req[i].mem_ptrs = mem_ptr; + mem_ptr += count_others_req_per_proc[i]; + count_others_req_procs++; + } else + others_req[i].count = 0; + } + *count_others_req_per_proc_ptr = count_others_req_per_proc; + +/* now send the calculated offsets and lengths to respective processes */ + + requests = (MPI_Request *) + NCI_Malloc((count_my_req_procs + count_others_req_procs) * 2 * sizeof(MPI_Request)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) + continue; + if (i == myrank) { + /* send to self uses memcpy()C, here others_req[i].count == my_req[i].count */ + memcpy(others_req[i].offsets, my_req[i].offsets, + my_req[i].count * sizeof(MPI_Offset)); +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(MPI_Offset)); +#else + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(int)); +#endif + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv_c(others_req[i].lens, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else + assert(others_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Irecv(others_req[i].offsets, (int)others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv(others_req[i].lens, (int)others_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + for (i = 0; i < nprocs; i++) { + if (my_req[i].count && i != myrank) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Isend_c(my_req[i].offsets, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend_c(my_req[i].lens, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else 
+ assert(my_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Isend(my_req[i].offsets, (int)my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend(my_req[i].lens, (int)my_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + if (j) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(j, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) NCI_Malloc(j * sizeof(MPI_Status)); + MPI_Waitall(j, requests, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(requests); + + *count_others_req_procs_ptr = count_others_req_procs; +} + +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req) +{ + NCI_Free(count_others_req_per_proc); + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); +} + diff --git a/src/drivers/pncio/pncio_close.c b/src/drivers/pncio/pncio_close.c new file mode 100644 index 000000000..4ecc09cc6 --- /dev/null +++ b/src/drivers/pncio/pncio_close.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* strdup() */ +#include +#include + +#include + +#include "pncio.h" + +/*----< PNCIO_File_close() >--------------------------------------------------*/ +int PNCIO_File_close(PNCIO_File *fh) +{ + int err = NC_NOERR; + + err = close(fh->fd_sys); + if (err != 0) + err = ncmpii_error_posix2nc("close"); + + if (fh->hints->ranklist != NULL) + NCI_Free(fh->hints->ranklist); + if (fh->hints != NULL) + NCI_Free(fh->hints); + if (fh->info != MPI_INFO_NULL) + MPI_Info_free(&(fh->info)); + if (fh->io_buf != NULL) + NCI_Free(fh->io_buf); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, rank; + double timing[NMEASURES*2], max_t[NMEASURES*2], pread_t; + MPI_Count max_ntimes, counter[NMEASURES*2], max_c[NMEASURES*2]; + + /* print two-phase I/O timing breakdown */ + MPI_Comm_rank(fh->comm, &rank); + for (i=0; iwrite_timing[i]; + counter[i] = fh->write_counter[i]; + timing[i+NMEASURES] = fh->read_timing[i]; + counter[i+NMEASURES] = fh->read_counter[i]; + } + MPI_Reduce(timing, max_t, NMEASURES*2, MPI_DOUBLE, MPI_MAX, 0, fh->comm); + MPI_Reduce(counter, max_c, NMEASURES*2, MPI_COUNT, MPI_MAX, 0, fh->comm); + + pread_t = max_t[NMEASURES+2]; + max_ntimes = max_c[0]; + + if (rank == 0 && max_ntimes > 0) { + printf("%s: TWO-PHASE write init %5.2f pwrite %5.2f pread %5.2f post %5.2f hsort %5.2f comm %5.2f collw %5.2f\n", + __func__, max_t[1], max_t[2], pread_t, max_t[4], max_t[5], max_t[3], max_t[0]); + printf("%s: TWO-PHASE write ntimes %lld check_hole %lld (total_num %lld nrecv %lld) no check %lld (total_num %lld nrecv %lld)\n", + __func__, max_c[0], max_c[1], max_c[2], max_c[3], max_c[4], max_c[5], max_c[6]); + } + + max_ntimes = max_c[NMEASURES]; + + if (rank == 0 && max_ntimes > 0) + printf("%s: TWO-PHASE read init %5.2f pread %5.2f post %5.2f wait %5.2f collr %5.2f ntimes %lld\n", + __func__, max_t[NMEASURES+1], max_t[NMEASURES+2], max_t[NMEASURES+4], max_t[NMEASURES+3], max_t[NMEASURES+0], max_ntimes); +#endif + + return err; +} diff --git a/src/drivers/pncio/pncio_delete.c b/src/drivers/pncio/pncio_delete.c new file mode 100644 index 000000000..514f3a325 --- /dev/null +++ b/src/drivers/pncio/pncio_delete.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
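A usage sketch of the open/close pair whose close side is shown above; this is inferred only from the prototypes declared in pncio.h, and real callers would also initialize the PNCIO_File fields and check errors more carefully:

#include <mpi.h>
#include "pncio.h"   /* the header introduced earlier in this patch */

static int example_open_close(MPI_Comm comm, const char *path)
{
    PNCIO_File fh;
    int err = PNCIO_File_open(comm, path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
    if (err != NC_NOERR)
        return err;
    /* ... PNCIO_File_write_at_all() / PNCIO_File_read_at_all() ... */
    return PNCIO_File_close(&fh);  /* releases hints, info, and io_buf */
}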
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#ifdef HAVE_UNISTD_H +#include /* unlink() */ +#endif + +#include +#include "pncio.h" + +/*----< PNCIO_File_delete() >-------------------------------------------------*/ +int PNCIO_File_delete(const char *filename) +{ + int err = NC_NOERR; + char *path = ncmpii_remove_file_system_type_prefix(filename); + + err = unlink(path); + if (err != 0) + err = ncmpii_error_posix2nc("unlink"); + + return err; +} + diff --git a/src/drivers/pncio/pncio_fstype.c b/src/drivers/pncio/pncio_fstype.c new file mode 100644 index 000000000..9713b7011 --- /dev/null +++ b/src/drivers/pncio/pncio_fstype.c @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* readlink() */ +#include /* strdup() */ +#include +#include +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* basename() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_VFS_H +#include +#endif +#ifdef HAVE_SYS_STATVFS_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_MOUNT_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat(), lstat(), stat() */ +#endif + +#include + +#include "pncio.h" + +/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that here. + * We assume that S_ISLNK is *always* defined as a macro. If that is not + * universally true, then add a test to the configure that tries to link + * a program that references S_ISLNK + */ +#if !defined(S_ISLNK) +#if defined(S_IFLNK) +/* Check for the link bit */ +#define S_ISLNK(mode) ((mode) & S_IFLNK) +#else +/* no way to check if it is a link, so say false */ +#define S_ISLNK(mode) 0 +#endif +#endif /* !(S_ISLNK) */ + +/* Returns a string, the parent directory of a given filename. + * The caller should free the memory located returned by this subroutine. + */ +static +void parentdir(const char *filename, char **dirnamep) +{ + int err; + char *dir = NULL, *slash; + struct stat statbuf; + + err = lstat(filename, &statbuf); + + if (err || (!S_ISLNK(statbuf.st_mode))) { + /* No such file, or file is not a link; these are the "normal" cases + * where we can just return the parent directory. + */ + dir = NCI_Strdup(filename); + } else { + /* filename is a symlink. We've presumably already tried to stat it + * and found it to be missing (dangling link), but this code doesn't + * care if the target is really there or not. + */ + ssize_t namelen; + char *linkbuf; + + linkbuf = NCI_Malloc(PATH_MAX + 1); + namelen = readlink(filename, linkbuf, PATH_MAX + 1); + if (namelen == -1) { + /* Something strange has happened between the time that we + * determined that this was a link and the time that we attempted + * to read it; punt and use the old name. 
+ */ + dir = NCI_Strdup(filename); + } else { + /* successfully read the link */ + linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */ + dir = NCI_Strdup(linkbuf); + } + NCI_Free(linkbuf); + } + + slash = strrchr(dir, '/'); + if (!slash) + strncpy(dir, ".", 2); + else { + if (slash == dir) + *(dir + 1) = '\0'; + else + *slash = '\0'; + } + + *dirnamep = dir; + return; +} + +#define UNKNOWN_SUPER_MAGIC (0xDEADBEEF) +#ifndef LL_SUPER_MAGIC +#define LL_SUPER_MAGIC 0x0BD00BD0 +#endif + +static int check_statfs(const char *filename, int64_t * file_id) +{ + int err = 0; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + /* rare: old solaris machines */ + struct statvfs vfsbuf; +#endif +#if defined(HAVE_STRUCT_STATFS_F_TYPE) || defined(HAVE_STRUCT_STATFS_F_FSTYPENAME) + /* common fs-detection logic for any modern POSIX-compliant environment, + * with the one wrinkle that some platforms (Darwin, BSD) give us a file + * system as a string, not an identifier */ + struct statfs fsbuf; +#endif + + *file_id = UNKNOWN_SUPER_MAGIC; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + err = statvfs(filename, &vfsbuf); + if (err == 0) + *file_id = vfsbuf.f_basetype; +#endif + + /* remember above how I said 'statfs with f_type' was the common linux-y + * way to report file system type? Darwin (and probably the BSDs) *also* + * uses f_type but it is "reserved" and does not give us anything + * meaningful. Fine. If configure detects f_type we'll use it here and on + * those "reserved" platforms we'll ignore that result and check the + * f_fstypename field. + */ + +#ifdef HAVE_STRUCT_STATFS_F_TYPE + err = statfs(filename, &fsbuf); + if (err == 0) { + *file_id = fsbuf.f_type; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STATFS_F_FSTYPENAME + /* these stat routines store the file system type in a string */ + err = statfs(filename, &fsbuf); + if (err == 0 && !strncasecmp(fsbuf.f_fstypename, "lustre", 6)) { + *file_id = LL_SUPER_MAGIC; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STAT_ST_FSTYPE + struct stat sbuf; + err = stat(filename, &sbuf); + if (err == 0) { + *file_id = sbuf.st_fstype; + return 0; + } +#endif + return err; +} + +/* Check if file system type from file name, using a system-dependent function + * call. + */ +int PNCIO_FileSysType(const char *filename) +{ + + int err, retry_cnt; + int64_t file_id=UNKNOWN_SUPER_MAGIC; + + char *colon = strchr(filename, ':'); + if (colon != NULL) { /* there is a prefix end with : */ + if (!strncmp(filename, "lustre", 6)) + return PNCIO_LUSTRE; + else if (!strncmp(filename, "ufs", 3)) + return PNCIO_UFS; + else + return 0; + } +#ifdef MIMIC_LUSTRE + return PNCIO_LUSTRE; +#endif + + /* NFS can get stuck and end up returning ESTALE "forever" */ + +#define MAX_ESTALE_RETRY 10000 + + retry_cnt = 0; + do { + err = check_statfs(filename, &file_id); + } while (err && (errno == ESTALE) && retry_cnt++ < MAX_ESTALE_RETRY); + + if (err) { + /* ENOENT may be returned in two cases: + * 1) no directory entry for "filename" + * 2) "filename" is a dangling symbolic link + * + * parentdir() tries to deal with both cases. 
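A usage sketch of the detection logic in this function: a "lustre:" or "ufs:" prefix is honored without touching the file system, while plain paths fall through to the statfs()-based probe. PNCIO_LUSTRE and PNCIO_UFS are the constants used throughout this patch:

#include <stdio.h>
#include "pncio.h"

static void report_fstype(const char *path)
{
    int t = PNCIO_FileSysType(path);
    printf("%s -> %s\n", path, (t == PNCIO_LUSTRE) ? "PNCIO_LUSTRE"
                             : (t == PNCIO_UFS)    ? "PNCIO_UFS"
                                                   : "unrecognized");
}

/* report_fstype("lustre:/scratch/out.nc");  prefix match, no statfs() call */
/* report_fstype("/tmp/out.nc");             probes the underlying fs      */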
+ */ + if (errno == ENOENT) { + char *dir; + parentdir(filename, &dir); + err = check_statfs(dir, &file_id); + NCI_Free(dir); + } else + return 0; + } + + if (file_id == LL_SUPER_MAGIC) + return PNCIO_LUSTRE; + else + return PNCIO_UFS; /* UFS support if we don't know what else to use */ +} + diff --git a/src/drivers/pncio/pncio_hints.c b/src/drivers/pncio/pncio_hints.c new file mode 100644 index 000000000..0e7db29a3 --- /dev/null +++ b/src/drivers/pncio/pncio_hints.c @@ -0,0 +1,326 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "pncio.h" + +/*----< PNCIO_File_get_info() >-----------------------------------------------*/ +int PNCIO_File_get_info(PNCIO_File *fd, + MPI_Info *info_used) +{ + int err; + + err = MPI_Info_dup(fd->info, info_used); + if (err == MPI_SUCCESS) + err = NC_NOERR; + else + err = ncmpii_error_mpi2nc(err, "MPI_Info_dup"); + + return err; +} + +/*----< Info_check_and_install_int() >---------------------------------------*/ +static +int Info_check_and_install_int(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int intval, tmp_val, flag, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + intval = atoi(value); + tmp_val = intval; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != intval) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + + MPI_Info_set(fd->info, key, value); + /* some file systems do not cache hints in the fd struct */ + if (local_cache != NULL) + *local_cache = intval; + } +fn_exit: + return ret; +} + +/*----< Info_check_and_install_enabled() >-----------------------------------*/ +static +int Info_check_and_install_enabled(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int tmp_val, flag, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_ENABLE; + } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_DISABLE; + } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_AUTO; + /* treat the user-provided string like "enabled": either it is a + * hint ROMIO knows about and can support it, or ROMIO will not + * return the hint at all in the MPI_File_get_info info object + */ + } else if (!strcmp(value, "requested") || !strcmp(value, "REQUESTED")) { + MPI_Info_set(fd->info, key, "enable"); + *local_cache = PNCIO_HINT_ENABLE; + } + + tmp_val = *local_cache; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != *local_cache) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + } +fn_exit: + return ret; +} + +/*----< Info_check_and_install_true() >--------------------------------------*/ +static +int Info_check_and_install_true(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int flag, tmp_val, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + 
MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = 1; + } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = 0; + } + tmp_val = *local_cache; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != *local_cache) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + } +fn_exit: + return ret; +} + +#if 0 +/*----< Info_check_and_install_str() >---------------------------------------*/ +static +int Info_check_and_install_str(PNCIO_File *fd, + MPI_Info info, + const char *key, + char **local_cache) +{ + int flag, ret = 0; + size_t len; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + MPI_Info_set(fd->info, key, value); + len = (strlen(value) + 1) * sizeof(char); + *local_cache = NCI_Malloc(len); + if (*local_cache == NULL) { + ret = NC_ENOMEM; + goto fn_exit; + } + strncpy(*local_cache, value, len); + } +fn_exit: + return ret; +} +#endif + +/*----< PNCIO_File_SetInfo() >------------------------------------------------*/ +/* For PnetCDF, a file info object can only be passed to PnetCDF at file create + * or open call, i.e. I/O hints cannot be changed after file create/open. + * + * When users_info == MPI_INFO_NULL, this subroutine is an independent call. + * When users_info != MPI_INFO_NULL, this subroutine is a collective call, + * because it calls Info_check_and_install_xxx(), which checks the consistency + * of all hints values set in user's info object. + * + * TODO: instead of sync each hint, a better implementation is to have root + * bcast all hints and let each process checks inconsistency locally. + */ +int +PNCIO_File_SetInfo(PNCIO_File *fd, + MPI_Info users_info) +{ + int nprocs; + char value[MPI_MAX_INFO_VAL + 1]; + + if (users_info == MPI_INFO_NULL) + return NC_NOERR; + + MPI_Comm_size(fd->comm, &nprocs); + + /* initialize fd->info and hints to default values */ + MPI_Info_create(&(fd->info)); + + /* buffer size for collective I/O */ + MPI_Info_set(fd->info, "cb_buffer_size", PNCIO_CB_BUFFER_SIZE_DFLT); + fd->hints->cb_buffer_size = atoi(PNCIO_CB_BUFFER_SIZE_DFLT); + + /* default is to let pncio automatically decide whether or not to use + * collective buffering + */ + MPI_Info_set(fd->info, "romio_cb_read", "automatic"); + fd->hints->cb_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_cb_write", "automatic"); + fd->hints->cb_write = PNCIO_HINT_AUTO; + + /* cb_nodes may be set later right after file open call */ + fd->hints->cb_nodes = 0; + + /* hint indicating that no indep. 
I/O will be performed on this file */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = 0; + + /* buffer size for data sieving in independent reads */ + MPI_Info_set(fd->info, "ind_rd_buffer_size", PNCIO_IND_RD_BUFFER_SIZE_DFLT); + fd->hints->ind_rd_buffer_size = atoi(PNCIO_IND_RD_BUFFER_SIZE_DFLT); + + /* buffer size for data sieving in independent writes */ + MPI_Info_set(fd->info, "ind_wr_buffer_size", PNCIO_IND_WR_BUFFER_SIZE_DFLT); + fd->hints->ind_wr_buffer_size = atoi(PNCIO_IND_WR_BUFFER_SIZE_DFLT); + + /* default is to let romio automatically decide when to use data + * sieving + */ + MPI_Info_set(fd->info, "romio_ds_read", "automatic"); + fd->hints->ds_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_ds_write", "automatic"); + fd->hints->ds_write = PNCIO_HINT_AUTO; + + /* File striping parameters will be retrieved from the file system set, + * once the file is opened. These parameters can also be customized by + * a user's info. Thus, default values used below are to indicate + * whether or not they have been customized by the users. + */ + fd->hints->striping_unit = 0; + fd->hints->striping_factor = 0; + fd->hints->start_iodevice = -1; + /* Lustre overstriping ratio. 0 or 1 means disabled */ + fd->hints->fs_hints.lustre.overstriping_ratio = 1; + + /* add in user's info --------------------------------------------------*/ + Info_check_and_install_int(fd, users_info, "cb_buffer_size", + &fd->hints->cb_buffer_size); + + /* enable/disable collective buffering */ + Info_check_and_install_enabled(fd, users_info, "romio_cb_read", + &fd->hints->cb_read); + if (fd->hints->cb_read == PNCIO_HINT_DISABLE) { + /* romio_cb_read overrides no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = PNCIO_HINT_DISABLE; + } + + Info_check_and_install_enabled(fd, users_info, "romio_cb_write", + &fd->hints->cb_write); + if (fd->hints->cb_write == PNCIO_HINT_DISABLE) { + /* romio_cb_write overrides no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = PNCIO_HINT_DISABLE; + } + + /* user intends to call collective I/O APIs only */ + Info_check_and_install_true(fd, users_info, "romio_no_indep_rw", + &fd->hints->no_indep_rw); + if (fd->hints->no_indep_rw == 1) { + /* if 'no_indep_rw' set, also hint that we will do + * collective buffering: if we aren't doing independent io, + * then we have to do collective */ + MPI_Info_set(fd->info, "romio_cb_write", "enable"); + MPI_Info_set(fd->info, "romio_cb_read", "enable"); + fd->hints->cb_read = PNCIO_HINT_ENABLE; + fd->hints->cb_write = PNCIO_HINT_ENABLE; + } + + /* enable/disable data sieving */ + Info_check_and_install_enabled(fd, users_info, "romio_ds_read", + &fd->hints->ds_read); + Info_check_and_install_enabled(fd, users_info, "romio_ds_write", + &fd->hints->ds_write); + + /* number of I/O aggregators */ + Info_check_and_install_int(fd, users_info, "cb_nodes", + &fd->hints->cb_nodes); + /* check ill value */ + if (fd->hints->cb_nodes > 0 && fd->hints->cb_nodes <= nprocs) { + snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", fd->hints->cb_nodes); + MPI_Info_set(fd->info, "cb_nodes", value); + } + else { + fd->hints->cb_nodes = 0; + MPI_Info_set(fd->info, "cb_nodes", "0"); + } + + Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size", + &fd->hints->ind_wr_buffer_size); + Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size", + &fd->hints->ind_rd_buffer_size); + + /* file striping configuration */ + 
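For context, here is a minimal sketch of how an application would supply the hints handled in this function at create time. The hint values are illustrative; ncmpi_create() is PnetCDF's standard create entry point, and the hint keys are the ones parsed above and below:

#include <mpi.h>
#include <pnetcdf.h>

static int create_with_hints(MPI_Comm comm, const char *path, int *ncidp)
{
    MPI_Info info;
    int err;

    MPI_Info_create(&info);
    MPI_Info_set(info, "striping_factor", "8");       /* 8 OSTs           */
    MPI_Info_set(info, "striping_unit",   "1048576"); /* 1 MiB stripes    */
    MPI_Info_set(info, "cb_nodes",        "16");      /* 16 aggregators   */

    /* hints can only be passed at create/open time, per the comment above */
    err = ncmpi_create(comm, path, NC_CLOBBER | NC_64BIT_DATA, info, ncidp);
    MPI_Info_free(&info);
    return err;
}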
Info_check_and_install_int(fd, users_info, "striping_unit", + &fd->hints->striping_unit); + + Info_check_and_install_int(fd, users_info, "striping_factor", + &fd->hints->striping_factor); + + Info_check_and_install_int(fd, users_info, "start_iodevice", + &fd->hints->start_iodevice); + + /* Lustre overstriping ratio. 0 or 1 means disabled */ + Info_check_and_install_int(fd, users_info, "lustre_overstriping_ratio", + &fd->hints->fs_hints.lustre.overstriping_ratio); + + /* PnetCDF ignores the following hints. + * cb_config_list + * deferred_open + */ + + return NC_NOERR; +} + diff --git a/src/drivers/pncio/pncio_lock.c b/src/drivers/pncio/pncio_lock.c new file mode 100644 index 000000000..a78d181db --- /dev/null +++ b/src/drivers/pncio/pncio_lock.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include + +static +const char *GEN_flock_cmd_to_string(int cmd) +{ + switch (cmd) { +#ifdef F_GETLK64 + case F_GETLK64: + return "F_GETLK64"; +#else + case F_GETLK: + return "F_GETLK"; +#endif +#ifdef F_SETLK64 + case F_SETLK64: + return "F_SETLK64"; +#else + case F_SETLK: + return "F_SETLK"; +#endif +#ifdef F_SETLKW64 + case F_SETLKW64: + return "F_SETLKW64"; +#else + case F_SETLKW: + return "F_SETLKW"; +#endif + default: + return "UNEXPECTED"; + } +} + +static +const char *GEN_flock_type_to_string(int type) +{ + switch (type) { + case F_RDLCK: + return "F_RDLCK"; + case F_WRLCK: + return "F_WRLCK"; + case F_UNLCK: + return "F_UNLOCK"; + default: + return "UNEXPECTED"; + } +} + +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code, err_count = 0, sav_errno; + struct flock lock; + + if (len == 0) + return MPI_SUCCESS; + + + /* Depending on the compiler flags and options, struct flock + * may not be defined with types that are the same size as + * MPI_Offsets. */ +/* FIXME: This is a temporary hack until we use flock64 where + available. It also doesn't fix the broken Solaris header sys/types.h + header file, which declares off_t as a UNION ! Configure tests to + see if the off64_t is a union if large file support is requested; + if so, it does not select large file support. +*/ +#ifdef NEEDS_INT_CAST_WITH_FLOCK + lock.l_type = type; + lock.l_start = (int) offset; + lock.l_whence = whence; + lock.l_len = (int) len; +#else + lock.l_type = type; + lock.l_whence = whence; + lock.l_start = offset; + lock.l_len = len; +#endif + + sav_errno = errno; /* save previous errno in case we recover from retryable errors */ + errno = 0; + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && ((errno == EINTR) || ((errno == EINPROGRESS) && (++err_count < 10000)))); + + if (err && (errno != EBADF)) { + /* FIXME: This should use the error message system, + * especially for MPICH */ + fprintf(stderr, + "This requires fcntl(2) to be implemented. As of 8/25/2011 it is not. 
Generic MPICH Message: File locking failed in PNCIO_GEN_SetLock(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "- If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n" + "- If the file system is LUSTRE, ensure that the directory is mounted with the 'flock' option.\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + if (!err) /* report fcntl failure errno's (EBADF), otherwise */ + errno = sav_errno; /* restore previous errno in case we recovered from retryable errors */ + + error_code = (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} + +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code; +#ifdef _LARGEFILE64_SOURCE + struct flock64 lock; +#else + struct flock lock; +#endif + + if (len == 0) + return MPI_SUCCESS; + + lock.l_type = type; + lock.l_start = offset; + lock.l_whence = whence; + lock.l_len = len; + + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && (errno == EINTR)); + + if (err && (errno != EBADF)) { + fprintf(stderr, + "File locking failed in PNCIO_GEN_SetLock64(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock64:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + error_code = (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c new file mode 100644 index 000000000..d9a1692a6 --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -0,0 +1,1138 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* dirname() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat() */ +#endif + +#include + +#include "pncio.h" + +#ifdef MIMIC_LUSTRE +#define xstr(s) str(s) +#define str(s) #s +#define STRIPE_SIZE 64 +#define STRIPE_COUNT 4 +#endif + +#ifdef HAVE_LUSTRE +/* /usr/include/lustre/lustreapi.h + * /usr/include/linux/lustre/lustre_user.h + */ +#include + +#define PNETCDF_LUSTRE_DEBUG +// #define PNETCDF_LUSTRE_DEBUG_VERBOSE + +#define PATTERN_STR(pattern, int_str) ( \ + (pattern == LLAPI_LAYOUT_DEFAULT) ? "LLAPI_LAYOUT_DEFAULT" : \ + (pattern == LLAPI_LAYOUT_RAID0) ? "LLAPI_LAYOUT_RAID0" : \ + (pattern == LLAPI_LAYOUT_WIDE) ? "LLAPI_LAYOUT_WIDE" : \ + (pattern == LLAPI_LAYOUT_MDT) ? "LLAPI_LAYOUT_MDT" : \ + (pattern == LLAPI_LAYOUT_OVERSTRIPING) ? 
"LLAPI_LAYOUT_OVERSTRIPING" : \ + (pattern == LLAPI_LAYOUT_SPECIFIC) ? "LLAPI_LAYOUT_SPECIFIC" : \ + int_str) + +#define PRINT_LAYOUT(val) { \ + char int_str[32]; \ + snprintf(int_str, 32, "%lu", val); \ + printf("\t%-14s = %-25s (0x%lx)\n",#val,PATTERN_STR(val, int_str),val); \ +} + +/*----< get_total_avail_osts() >---------------------------------------------*/ +static +int get_total_avail_osts(const char *filename) +{ + char *dirc=NULL, *dname, *tail, **members=NULL, *buffer=NULL; + char pool_name[64], fsname[64], full_pool_name[128]; + int err, dd, num_members=0; + int max_members = 2048; /* Maximum number of members to retrieve */ + int buffer_size = 1048576; /* Buffer size for member names */ + struct llapi_layout *layout=NULL; + + dirc = NCI_Strdup(filename); + + struct stat sb; + if (stat(filename, &sb) == 0 && S_ISDIR(sb.st_mode)) + dname = dirc; + else + /* find the parent folder name */ + dname = dirname(dirc); + + dd = open(dname, O_RDONLY, 0600); + if (dd < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) fails to open folder %s (%s)\n", + __FILE__,__LINE__, dname, strerror(errno)); +#endif + goto err_out; + } + + /* obtain Lustre layout object */ + layout = llapi_layout_get_by_fd(dd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_get_by_fd() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* find the pool name */ + err = llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)-1); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_pool_name_get() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + else if (pool_name[0] == '\0') { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"%s at %d: %s has NO Pool Name\n",__FILE__, __LINE__,dname); +#endif + goto err_out; + } + /* For example, Perlmutter @NERSC, pool_name "original" is returned */ + + /* Using pool_name returned from llapi_layout_pool_name_get() is not enough + * when calling llapi_get_poolmembers(). We need to prepend it with + * 'fsname', which can be obtained by calling llapi_getname(). Note that + * console command 'lfs getname -n' returns fsname. For example, on + * Perlmutter @NERSC: + * login39::~/Lustre(12:52) #1165 lfs getname -n $SCRATCH/dummy + * scratch + */ + err = llapi_getname(dname, fsname, 63); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_getname() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* When dname is a folder, fsname returned from llapi_getname() may contain + * a trailing ID, e.g. scratch-ffff9ca88d9bd800. Must remove the trailing + * ID, otherwise llapi_get_poolmembers() is not able to find it. + */ + tail = strchr(fsname, '-'); + if (tail != NULL) *tail = '\0'; + + /* In case either pool_name and fsname are empty. For example, on Polaris + * @ALCF, the returned pool_name is empty, but fsname is not. 
+ */ + if (pool_name[0] == '\0' && fsname[0] == '\0') + goto err_out; + else if (pool_name[0] == '\0') + strcpy(full_pool_name, fsname); + else if (fsname[0] == '\0') + strcpy(full_pool_name, pool_name); + else + sprintf(full_pool_name, "%s.%s", fsname, pool_name); + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("%s at %d: file=%s dir=%s pool=%s fsname=%s full_pool_name=%s\n", + __func__,__LINE__, filename,dname,pool_name,fsname,full_pool_name); +#endif + + /* Allocate memory for the members and buffer */ + members = (char **)NCI_Malloc(max_members * sizeof(char *)); + buffer = (char *)NCI_Malloc(buffer_size); + + /* obtain pool's info */ + num_members = llapi_get_poolmembers(full_pool_name, members, max_members, + buffer, buffer_size); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (num_members > 0) { + int i, min_nmembers = MIN(num_members, 10); + printf("%s at %d: Found %d members for pool '%s':\n", + __func__,__LINE__,num_members, pool_name); + printf("\tFirst %d OSTs and last are\n",min_nmembers); + for (i=0; i= 0) close(dd); + if (layout != NULL) llapi_layout_free(layout); + if (dirc != NULL) NCI_Free(dirc); + if (buffer != NULL) NCI_Free(buffer); + if (members != NULL) NCI_Free(members); + + return num_members; +} + +static +int compare(const void *a, const void *b) +{ + if (*(uint64_t*)a > *(uint64_t*)b) return (1); + if (*(uint64_t*)a < *(uint64_t*)b) return (-1); + return (0); +} + +static +int sort_ost_ids(struct llapi_layout *layout, + uint64_t stripe_count, + uint64_t *osts) +{ + uint64_t i, numOSTs; + + for (i=0; i osts[numOSTs]) + osts[++numOSTs] = osts[i]; + + return (numOSTs + 1); +} + +/*----< get_striping() >-----------------------------------------------------*/ +static +uint64_t get_striping(int fd, + const char *path, + uint64_t *pattern, + uint64_t *stripe_count, + uint64_t *stripe_size, + uint64_t *start_iodevice) +{ + int err; + struct llapi_layout *layout; + uint64_t *osts=NULL, numOSTs=0; +#ifdef PNETCDF_LUSTRE_DEBUG + char int_str[32]; +#endif + + *pattern = LLAPI_LAYOUT_RAID0; + *stripe_count = LLAPI_LAYOUT_DEFAULT; + *stripe_size = LLAPI_LAYOUT_DEFAULT; + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + + layout = llapi_layout_get_by_fd(fd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_get_by_fd() fails\n", + __FILE__, __LINE__); +#endif + goto err_out; + } + + err = llapi_layout_pattern_get(layout, pattern); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *pattern); + fprintf(stderr,"Error at %s (%d) llapi_layout_pattern_get() fails to get patter %s\n", + __FILE__, __LINE__, PATTERN_STR(*pattern, int_str)); +#endif + goto err_out; + } + + /* obtain file striping count */ + err = llapi_layout_stripe_count_get(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_count); + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_count_get() fails to get stripe count %s\n", + __FILE__, __LINE__, PATTERN_STR(*stripe_count, int_str)); +#endif + goto err_out; + } + + /* obtain file striping unit size */ + err = llapi_layout_stripe_size_get(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_size); + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_size_get() fails to get stripe size %s\n", + __FILE__,__LINE__, PATTERN_STR(*stripe_size, int_str)); +#endif + goto err_out; + } + + /* /usr/include/linux/lustre/lustre_user.h + * The stripe size fields are shared for 
the extension size storage, + * however the extension size is stored in KB, not bytes. + * #define SEL_UNIT_SIZE 1024llu + * Therefore, the default stripe_size is (SEL_UNIT_SIZE * 1024) + */ + + if (*stripe_count == LLAPI_LAYOUT_DEFAULT || /* not set */ + *stripe_count == LLAPI_LAYOUT_INVALID || /* invalid */ + *stripe_count == LLAPI_LAYOUT_WIDE || /* all system's OSTs */ + *stripe_count > 1048576) { /* abnormally large number */ + return 0; + } + + /* obtain all OST IDs */ + osts = (uint64_t*) NCI_Malloc(sizeof(uint64_t) * (*stripe_count)); + if (llapi_layout_ost_index_get(layout, 0, &osts[0]) != 0) { + /* check if is a folder */ + struct stat path_stat; + fstat(fd, &path_stat); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (S_ISREG(path_stat.st_mode)) /* not a regular file */ + printf("%s at %d: %s is a regular file\n",__func__,__LINE__,path); + else if (S_ISDIR(path_stat.st_mode)) + printf("%s at %d: %s is a folder\n",__func__,__LINE__,path); + else +#endif + if (!S_ISREG(path_stat.st_mode) && /* not a regular file */ + !S_ISDIR(path_stat.st_mode)) { /* not a folder */ +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) calling fstat() file %s (neither a regular file nor a folder)\n", \ + __FILE__, __LINE__, path); +#endif + goto err_out; + } + + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + numOSTs = *stripe_count; + + goto err_out; + } + *start_iodevice = osts[0]; + + numOSTs = sort_ost_ids(layout, *stripe_count, osts); + assert(numOSTs <= *stripe_count); + +err_out: + if (osts != NULL) NCI_Free(osts); + if (layout != NULL) llapi_layout_free(layout); + + return numOSTs; +} + +/*----< set_striping() >-----------------------------------------------------*/ +static +int set_striping(const char *path, + uint64_t pattern, + uint64_t numOSTs, + uint64_t stripe_count, + uint64_t stripe_size, + uint64_t start_iodevice) +{ + int fd=-1, err=0; + + struct llapi_layout *layout = llapi_layout_alloc(); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_alloc() fails (%s)\n", + __FILE__, __LINE__, strerror(errno)); +#endif + goto err_out; + } + + /* When an abnormally large stripe_count is set by users, Lustre may just + * allocate the total number of available OSTs, instead of returning an + * error. + */ + err = llapi_layout_stripe_count_set(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_count_set() fails set stripe count %lu (%s)\n", + __FILE__, __LINE__, stripe_count, strerror(errno)); +#endif + goto err_out; + } + + err = llapi_layout_stripe_size_set(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_size_set() fails to set strpe size %lu (%s)\n", + __FILE__, __LINE__, stripe_size, strerror(errno)); +#endif + goto err_out; + } + + if (pattern == LLAPI_LAYOUT_OVERSTRIPING) { + uint64_t i, ost_id; + if (start_iodevice == LLAPI_LAYOUT_DEFAULT) + start_iodevice = 0; + for (i=0; i------------------------------------------*/ +/* Construct the list of I/O aggregators. It sets the followings. + * fd->hints->cb_nodes and set file info for hint cb_nodes. + * fd->hints->ranklist[], an int array of size fd->hints->cb_nodes. + * fd->is_agg: indicating whether this rank is an I/O aggregator + * fd->my_cb_nodes_index: index into fd->hints->ranklist[]. 
-1 if N/A + */ +static +int Lustre_set_cb_node_list(PNCIO_File *fd) +{ + int i, j, k, rank, nprocs, num_aggr, striping_factor; + int *nprocs_per_node, **ranks_per_node; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + /* number of MPI processes running on each node */ + nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + + for (i=0; i<nprocs; i++) + nprocs_per_node[fd->node_ids[i]]++; + + /* construct rank IDs of MPI processes running on each node */ + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); + for (i=1; i<fd->num_nodes; i++) + ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; + + for (i=0; i<fd->num_nodes; i++) nprocs_per_node[i] = 0; + + /* Populate ranks_per_node[], list of MPI ranks running on each node. + * Populate nprocs_per_node[], number of MPI processes on each node. + */ + for (i=0; i<nprocs; i++) { + k = fd->node_ids[i]; + ranks_per_node[k][nprocs_per_node[k]] = i; + nprocs_per_node[k]++; + } + + /* To save a call to MPI_Bcast(), all processes run the same code below to + * calculate num_aggr, the number of aggregators (later becomes cb_nodes). + * + * The calculation is based on the number of compute nodes, fd->num_nodes, + * and processes per node, nprocs_per_node. At this moment, all processes + * should have obtained the Lustre file striping settings. + */ + striping_factor = fd->hints->striping_factor; + + if (striping_factor > nprocs) { + /* When the number of MPI processes is less than striping_factor, set + * num_aggr to the largest number not exceeding nprocs that divides + * striping_factor. A naive way is: + * num_aggr = nprocs; + * while (striping_factor % num_aggr > 0) + * num_aggr--; + * Below is equivalent, but faster. + */ + int divisor = 2; + num_aggr = 1; + /* try to divide */ + while (striping_factor >= divisor * divisor) { + if ((striping_factor % divisor) == 0) { + if (striping_factor / divisor <= nprocs) { + /* The value is found! */ + num_aggr = striping_factor / divisor; + break; + } + /* if divisor is less than nprocs, divisor is a solution, + * but it may not be the best one + */ + else if (divisor <= nprocs) + num_aggr = divisor; + } + divisor++; + } + } + else { /* striping_factor <= nprocs */ + /* Select striping_factor processes to be I/O aggregators. Note this + * also applies to collective reads to allow more/fewer aggregators. In + * most cases, more aggregators yield better read performance. + */ + if (fd->hints->cb_nodes == 0) { + /* User did not set hint "cb_nodes" */ + if (nprocs >= striping_factor * 8 && nprocs/fd->num_nodes >= 8) + num_aggr = striping_factor * 8; + else if (nprocs >= striping_factor * 4 && nprocs/fd->num_nodes >= 4) + num_aggr = striping_factor * 4; + else if (nprocs >= striping_factor * 2 && nprocs/fd->num_nodes >= 2) + num_aggr = striping_factor * 2; + else + num_aggr = striping_factor; + } + else if (fd->hints->cb_nodes <= striping_factor) { + /* User has set hint cb_nodes and cb_nodes <= striping_factor. + * Ignore user's hint and try to set cb_nodes to be at least + * striping_factor. + */ + num_aggr = striping_factor; + } + else { + /* User has set hint cb_nodes and cb_nodes > striping_factor */ + if (nprocs < fd->hints->cb_nodes) + num_aggr = nprocs; /* BAD cb_nodes set by users */ + else + num_aggr = fd->hints->cb_nodes; + }
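A worked example of the divisor search in the striping_factor > nprocs branch above, as a standalone helper (the name pick_num_aggr is hypothetical):

#include <stdio.h>

/* Pick the largest number of aggregators <= nprocs that divides
 * striping_factor, so every aggregator serves the same number of OSTs.
 * Mirrors the divisor loop above, including its early exit. */
static int pick_num_aggr(int striping_factor, int nprocs)
{
    int num_aggr = 1;
    for (int divisor = 2; divisor * divisor <= striping_factor; divisor++) {
        if (striping_factor % divisor) continue;
        if (striping_factor / divisor <= nprocs)
            return striping_factor / divisor; /* largest co-divisor <= nprocs */
        if (divisor <= nprocs)
            num_aggr = divisor;  /* best small divisor seen so far */
    }
    return num_aggr;
}

/* pick_num_aggr(24, 5) == 4: each of 4 aggregators handles 6 OSTs;
 * pick_num_aggr(7, 3) == 1, since 7 is prime. */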
+ + /* Number of processes per node may not be enough to be picked as + * aggregators. In this case, reduce num_aggr (cb_nodes). Consider the + * following case: + * number of nodes = 7, + * number of processes = 18, + * striping_factor = 8, + * cb_nodes = 16. + * In this case, nodes 0, 1, 2, 3 run 3 processes each and nodes + * 4, 5, 6 run 2 processes each. In order to keep each OST only + * accessed by one or more aggregators running on the same compute + * node, cb_nodes should be reduced to 8. Thus the ranks of aggregators + * become 0, 3, 6, 9, 12, 14, 16, 1. The aggregator-OST mapping + * is shown below. + * Aggregator 0, running on node 0, accesses OST 0. + * Aggregator 3, running on node 1, accesses OST 1. + * Aggregator 6, running on node 2, accesses OST 2. + * Aggregator 9, running on node 3, accesses OST 3. + * Aggregator 12, running on node 4, accesses OST 4. + * Aggregator 14, running on node 5, accesses OST 5. + * Aggregator 16, running on node 6, accesses OST 6. + * Aggregator 1, running on node 0, accesses OST 7. + * + * Another case (the total number of processes changes to 25): + * number of nodes = 7, + * number of processes = 25, + * striping_factor = 8, + * cb_nodes = 16. + * In this case, nodes 0, 1, 2, 3 run 4 processes each and nodes 4, 5, + * 6 run 3 processes each. cb_nodes should remain 16 and the ranks of + * aggregators become 0, 4, 8, 12, 16, 19, 22, 1, 2, 6, 10, 14, 18, 21, + * 24, 3. The aggregator-OST mapping is shown below. + * Aggregators 0, 2, running on node 0, access OST 0. + * Aggregators 4, 6, running on node 1, access OST 1. + * Aggregators 8, 10, running on node 2, access OST 2. + * Aggregators 12, 14, running on node 3, access OST 3. + * Aggregators 16, 18, running on node 4, access OST 4. + * Aggregators 19, 21, running on node 5, access OST 5. + * Aggregators 22, 24, running on node 6, access OST 6. + * Aggregator 3, running on node 0, accesses OST 7. + */ + int max_nprocs_node = 0; + for (i=0; i<fd->num_nodes; i++) + max_nprocs_node = MAX(max_nprocs_node, nprocs_per_node[i]); + int max_naggr_node = striping_factor / fd->num_nodes; + if (striping_factor % fd->num_nodes) max_naggr_node++; + /* max_naggr_node is the max number of processes per node to be picked + * as aggregator in each round. + */ + int rounds = num_aggr / striping_factor; + if (num_aggr % striping_factor) rounds++; + while (max_naggr_node * rounds > max_nprocs_node) rounds--; + num_aggr = striping_factor * rounds; + } + + /* TODO: the above setting for num_aggr is for collective writes. Should + * collective reads use the same? Or just set cb_nodes to the number of + * nodes? + */ + + /* The next step is to determine the MPI rank IDs of I/O aggregators and add + * them into ranklist[]. Note fd->hints->ranklist will be freed in + * PNCIO_File_close(). + */ + fd->hints->ranklist = (int *) NCI_Malloc(num_aggr * sizeof(int)); + if (fd->hints->ranklist == NULL) + return NC_ENOMEM; + + int block_assignment=0; +#ifdef TRY_AGGR_BLOCK_ASSIGNMENT + { + char *env_str; + if ((env_str = getenv("PNETCDF_USE_BLOCK_ASSIGN")) != NULL) + block_assignment = (strcasecmp(env_str, "true") == 0) ? 1 : 0; + if (rank == 0) + printf("%s %d: PNETCDF_USE_BLOCK_ASSIGN = %d\n", + __func__,__LINE__,block_assignment); + } +#endif + + if (striping_factor <= fd->num_nodes) { + /* When the number of OSTs is less than the number of compute nodes, + * first select a number of nodes equal to the number of OSTs by + * spreading the selection evenly across all compute nodes (i.e. with a + * stride between every 2 consecutive nodes). + * Selection of MPI ranks can be done in 2 ways. + * 1. block assignment + * Select ranks from a node and then move on to the next node. + * 2.
+         * 2. cyclic assignment
+         *    Select ranks round-robin across all selected nodes.
+         * Note when selecting ranks within a node, the ranks are evenly
+         * spread among all processes in the node.
+         */
+        if (block_assignment) {
+            int n=0;
+            int remain = num_aggr % striping_factor;
+            int node_stride = fd->num_nodes / striping_factor;
+            /* walk through each node and pick aggregators */
+            for (j=0; j<fd->num_nodes; j+=node_stride) {
+                /* Selecting node IDs with a stride. j is the node ID */
+                int nranks_per_node = num_aggr / striping_factor;
+                /* front nodes may have 1 more to pick */
+                if (remain > 0 && j/node_stride < remain) nranks_per_node++;
+                int rank_stride = nprocs_per_node[j] / nranks_per_node;
+                for (k=0; k<nranks_per_node; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->num_nodes; /* break loop j */
+                        break;             /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            int avg = num_aggr / striping_factor;
+            int stride = fd->num_nodes / striping_factor;
+            if (num_aggr % striping_factor) avg++;
+            for (i = 0; i < num_aggr; i++) {
+                /* j is the selected node ID. This selection is round-robin
+                 * across selected nodes.
+                 */
+                j = (i % striping_factor) * stride;
+                k = (i / striping_factor) * (nprocs_per_node[j] / avg);
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+    }
+    else { /* striping_factor > fd->num_nodes */
+        /* When the number of OSTs is more than the number of compute nodes,
+         * I/O aggregators are selected from all nodes. Within each node,
+         * aggregators are spread evenly instead of using the first few
+         * ranks.
+         */
+        int *naggr_per_node, *idx_per_node, avg;
+        idx_per_node = (int*) NCI_Calloc(fd->num_nodes, sizeof(int));
+        naggr_per_node = (int*) NCI_Malloc(fd->num_nodes * sizeof(int));
+        for (i = 0; i < striping_factor % fd->num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->num_nodes + 1;
+        for (; i < fd->num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->num_nodes;
+        avg = num_aggr / striping_factor;
+        if (avg > 0)
+            for (i = 0; i < fd->num_nodes; i++)
+                naggr_per_node[i] *= avg;
+        for (i = 0; i < fd->num_nodes; i++)
+            naggr_per_node[i] = MIN(naggr_per_node[i], nprocs_per_node[i]);
+        /* naggr_per_node[] is the number of processes on each node that can
+         * be selected as I/O aggregators
+         */
+
+        if (block_assignment) {
+            int n = 0;
+            for (j=0; j<fd->num_nodes; j++) {
+                /* j is the node ID */
+                int rank_stride = nprocs_per_node[j] / naggr_per_node[j];
+                /* trying stride==1 seems to have no effect: rank_stride = 1; */
+                for (k=0; k<naggr_per_node[j]; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->num_nodes; /* break loop j */
+                        break;             /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            for (i = 0; i < num_aggr; i++) {
+                int stripe_i = i % striping_factor;
+                j = stripe_i % fd->num_nodes; /* to select from node j */
+                k = nprocs_per_node[j] / naggr_per_node[j];
+                k *= idx_per_node[j];
+                /* trying stride==1 seems to have no effect: k = idx_per_node[j]; */
+                idx_per_node[j]++;
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+        NCI_Free(naggr_per_node);
+        NCI_Free(idx_per_node);
+    }
+
+    /* TODO: we could keep these two arrays for dynamic construction of
+     * fd->hints->ranklist[], such as in the group-cyclic file domain
+     * assignment method, used in each collective write call.
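+     *
+     * A sketch (hypothetical, not part of this patch) of how a ranklist
+     * could be rebuilt from the two arrays, picking aggregators round-robin
+     * across nodes; it assumes num_aggr <= nprocs, so the loop terminates:
+     *
+     *     int n = 0, *next = (int*) NCI_Calloc(fd->num_nodes, sizeof(int));
+     *     while (n < num_aggr)
+     *         for (j = 0; j < fd->num_nodes && n < num_aggr; j++)
+     *             if (next[j] < nprocs_per_node[j])  // node j has one left
+     *                 ranklist[n++] = ranks_per_node[j][next[j]++];
+     *     NCI_Free(next);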
+     */
+    NCI_Free(nprocs_per_node);
+    NCI_Free(ranks_per_node[0]);
+    NCI_Free(ranks_per_node);
+
+    /* set hint cb_nodes */
+    fd->hints->cb_nodes = num_aggr;
+
+    /* check whether this process is selected as an I/O aggregator */
+    fd->is_agg = 0;
+    fd->my_cb_nodes_index = -1;
+    for (i = 0; i < num_aggr; i++) {
+        if (rank == fd->hints->ranklist[i]) {
+            fd->is_agg = 1;
+            fd->my_cb_nodes_index = i;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/*----< PNCIO_Lustre_create() >----------------------------------------------*/
+/* 1. root creates the file
+ * 2. root sets and obtains striping info
+ * 3. root broadcasts striping info
+ * 4. non-root processes receive striping info from root
+ * 5. non-root processes open the file
+ */
+int
+PNCIO_Lustre_create(PNCIO_File *fd,
+                    int         mpi_io_mode)
+{
+    char int_str[16];
+    int err=NC_NOERR, rank, perm, old_mask;
+    int stripin_info[4] = {-1, -1, -1, -1};
+#ifdef HAVE_LUSTRE
+    int total_num_OSTs;
+    uint64_t numOSTs, pattern, stripe_count, stripe_size, start_iodevice;
+#endif
+
+#ifdef WKL_DEBUG
+extern int first_ost_id;
+first_ost_id = -1;
+#endif
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+static int wkl=0; if (wkl == 0 && rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);}
+#endif
+
+#if defined(HAVE_LUSTRE) || defined(MIMIC_LUSTRE)
+assert(mpi_io_mode & MPI_MODE_CREATE);
+
+/* Note ncmpi_create always creates a file with readable and writable
+ * permission. */
+    int amode = O_CREAT;
+    if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR;
+#endif
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    /* The root process creates the file first, followed by all processes
+     * opening the file.
+     */
+    if (rank > 0) goto err_out;
+
+    /* For Lustre, we need to obtain the file striping info
+     * (striping_factor, striping_unit, and num_osts) in order to select the
+     * I/O aggregators in fd->hints->ranklist, no matter whether the mode is
+     * open or create.
+     */
+
+#ifdef HAVE_LUSTRE
+    int overstriping_ratio, str_factor, str_unit, start_iodev;
+
+    /* In a call to PNCIO_File_SetInfo() earlier, hints have been validated
+     * to be consistent among all processes.
+     */
+
+    str_unit           = fd->hints->striping_unit;
+    str_factor         = fd->hints->striping_factor;
+    start_iodev        = fd->hints->start_iodevice;
+    overstriping_ratio = fd->hints->fs_hints.lustre.overstriping_ratio;
+
+    /* obtain the total number of OSTs available */
+    total_num_OSTs = get_total_avail_osts(fd->filename);
+    if (total_num_OSTs <= 0) /* failed to obtain number of available OSTs */
+        total_num_OSTs = PNCIO_LUSTRE_MAX_OSTS;
+    if (str_factor > total_num_OSTs)
+        str_factor = total_num_OSTs;
+
+    numOSTs        = 0;
+    pattern        = LLAPI_LAYOUT_DEFAULT;
+    stripe_count   = LLAPI_LAYOUT_DEFAULT;
+    stripe_size    = LLAPI_LAYOUT_DEFAULT;
+    start_iodevice = LLAPI_LAYOUT_DEFAULT;
+
+    fd->fd_sys = -1;
+
+    /* When no file striping hint is set, the default values are:
+     *     fd->hints->striping_factor = 0;
+     *     fd->hints->striping_unit = 0;
+     *     fd->hints->start_iodevice = -1;
+     *     fd->hints->fs_hints.lustre.overstriping_ratio = 1;
+     */
+
+    /* In many cases, the Lustre striping configuration of the file to be
+     * created is not explicitly set by the user (through I/O hints
+     * striping_factor and striping_unit), or the striping configuration of
+     * the parent folder to store the new file is not explicitly set by the
+     * user.
+ * + * Here, if application did not set the file striping hints, we set the new + * file's striping count to be equal to the number of compute nodes + * allocated to fd->comm and the striping size to 1 MiB. Inheriting the + * striping from the parent folder is disabled. But if inheritance is + * desired, this can be changed by defining macro INHERIT_DIR_STRIPING + * which enables the code block below. + * + * Note if the application explicitly set hints striping_factor and + * striping_unit, then they take precedence over the default. + */ +#ifdef INHERIT_DIR_STRIPING + /* Inherit the file striping settings of the folder. */ + + if (str_factor == 0 || str_unit == 0 || + (overstriping_ratio > 1 && start_iodev < 0)) { + /* When not all of the striping parameters are set by users, inherit + * those missing ones from the folder. + */ + int dd; + char *dirc, *dname; + dirc = NCI_Strdup(fd->filename); + dname = dirname(dirc); + + dd = open(dname, O_RDONLY, PNCIO_PERM); + + numOSTs = get_striping(dd, dname, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + close(dd); + NCI_Free(dirc); + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("line %d: use parent folder's striping to set file's:\n",__LINE__); + PRINT_LAYOUT(numOSTs); + PRINT_LAYOUT(stripe_count); + PRINT_LAYOUT(stripe_size); + PRINT_LAYOUT(start_iodevice); + PRINT_LAYOUT(pattern); +#endif + /* in case of default striping setting is used */ + if (numOSTs == 0) numOSTs = 1; + } +#endif + + /* If hint striping_factor is not set by the user and the new file's folder + * has not set its striping parameters, then we set the number of unique + * OSTs, numOSTs, to the number of compute nodes allocated to this job, + * which sets stripe_count to (numOSTs * overstriping_ratio). + */ + if (str_factor == 0 && (stripe_count == LLAPI_LAYOUT_DEFAULT || + stripe_count == LLAPI_LAYOUT_WIDE)) { + stripe_count = MIN(fd->num_nodes, total_num_OSTs); + if (overstriping_ratio > 1) stripe_count *= overstriping_ratio; + } + else if (str_factor > 0) + stripe_count = str_factor; + + /* When overstriping is requested by the user, calculate the number of + * unique OSTs. + */ + if (overstriping_ratio > 1) { + pattern = LLAPI_LAYOUT_OVERSTRIPING; + if (stripe_count < overstriping_ratio) + numOSTs = 1; + else + numOSTs = stripe_count / overstriping_ratio; + } + /* If ill values are detected, fall back to no overstriping */ + if (overstriping_ratio <= 1 || numOSTs == stripe_count) { + numOSTs = stripe_count; + pattern = LLAPI_LAYOUT_RAID0; + } + + /* If user has not set hint striping_unit and the folder's striping size is + * also not set, then use the default. + */ + if (str_unit == 0 && stripe_size == LLAPI_LAYOUT_DEFAULT) + stripe_size = LLAPI_LAYOUT_DEFAULT; + else if (str_unit > 0) + stripe_size = str_unit; + + /* If user has not set hint start_iodevice and the folder's start_iodevice + * is also not set, then use the default. 
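+     *
+     * The same precedence, user hint first, then the folder's setting, then
+     * the Lustre default, applies to all three striping parameters. A
+     * compact sketch of the rule (hypothetical helper, not part of this
+     * patch):
+     *
+     *     uint64_t resolve(int hint, uint64_t folder_val) {
+     *         if (hint > 0) return (uint64_t) hint;    // user hint wins
+     *         if (folder_val != LLAPI_LAYOUT_DEFAULT)
+     *             return folder_val;                   // inherit folder
+     *         return LLAPI_LAYOUT_DEFAULT;             // let Lustre pick
+     *     }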
+ */ + if (start_iodev == -1 && start_iodevice == LLAPI_LAYOUT_DEFAULT) + start_iodevice = LLAPI_LAYOUT_DEFAULT; + else if (start_iodev > 0) + start_iodevice = start_iodev; + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("\n\tAfter adjust striping parameters become:\n"); + PRINT_LAYOUT(numOSTs); + PRINT_LAYOUT(stripe_count); + PRINT_LAYOUT(stripe_size); + PRINT_LAYOUT(start_iodevice); + PRINT_LAYOUT(pattern); +#endif + + /* create a new file and set striping */ + fd->fd_sys = set_striping(fd->filename, pattern, + numOSTs, + stripe_count, + stripe_size, + start_iodevice); + + if (fd->fd_sys < 0) + /* If explicitly setting file striping failed, inherit the striping + * from the folder by simply creating the file. + */ + fd->fd_sys = open(fd->filename, amode, perm); + + if (fd->fd_sys < 0) { + fprintf(stderr,"Error at %s (%d) fails to create file %s (%s)\n", + __FILE__,__LINE__, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("Lustre set striping"); + goto err_out; + } + + /* Obtain Lustre file striping parameters actually set. */ + numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + + stripin_info[0] = stripe_size; + stripin_info[1] = stripe_count; + stripin_info[2] = start_iodevice; + stripin_info[3] = numOSTs; + +#elif defined(MIMIC_LUSTRE) + fd->fd_sys = open(fd->filename, amode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d fails to create file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + + char *env_str = getenv("MIMIC_STRIPE_SIZE"); + if (env_str != NULL) + stripin_info[0] = atoi(env_str); + else + stripin_info[0] = STRIPE_SIZE; + stripin_info[1] = STRIPE_COUNT; + stripin_info[2] = 0; + stripin_info[3] = STRIPE_COUNT; +#endif + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + if (fd->file_system == PNCIO_LUSTRE && + (stripin_info[0] == -1 || stripin_info[3] == 0)) { + fprintf(stderr, "%s line %d: failed to create Lustre file %s\n", + __FILE__, __LINE__, fd->filename); + return err; + } + + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + if (fd->file_system == PNCIO_LUSTRE) { + fd->hints->fs_hints.lustre.num_osts = stripin_info[3]; + fd->hints->fs_hints.lustre.overstriping_ratio = stripin_info[1] / stripin_info[3]; + } + + if (rank > 0) { /* non-root processes */ + fd->fd_sys = open(fd->filename, O_RDWR, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d failure to open file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + return ncmpii_error_posix2nc("ioctl"); + } + } + + /* construct cb_nodes rank list */ + Lustre_set_cb_node_list(fd); + + MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:"); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.num_osts); + MPI_Info_set(fd->info, "lustre_num_osts", int_str); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.overstriping_ratio); + MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str); + + return err; +} + +/*----< PNCIO_Lustre_open() >------------------------------------------------*/ +/* 1. all processes open the file. + * 2. 
root obtains striping info and broadcasts to all others + */ +int +PNCIO_Lustre_open(PNCIO_File *fd) +{ + char int_str[16]; + int err=NC_NOERR, rank, perm, old_mask; + int stripin_info[4] = {1048576, -1, -1, -1}; + +#ifdef WKL_DEBUG +extern int first_ost_id; +first_ost_id = -1; +#endif + + MPI_Comm_rank(fd->comm, &rank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +static int wkl=0; if (wkl == 0 && rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);} +#endif + + old_mask = umask(022); + umask(old_mask); + perm = old_mask ^ PNCIO_PERM; + + int omode = (fd->access_mode & MPI_MODE_RDWR) ? O_RDWR : O_RDONLY; + + /* All processes open the file. */ + fd->fd_sys = open(fd->filename, omode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr, "%s line %d: rank %d fails to open file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + + /* Only root obtains the striping information and bcast to all other + * processes. + */ + if (rank == 0) { +#ifdef HAVE_LUSTRE + uint64_t numOSTs=0; + uint64_t pattern = LLAPI_LAYOUT_DEFAULT; + uint64_t stripe_count = LLAPI_LAYOUT_DEFAULT; + uint64_t stripe_size = LLAPI_LAYOUT_DEFAULT; + uint64_t start_iodevice = LLAPI_LAYOUT_DEFAULT; + + numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + + stripin_info[0] = stripe_size; + stripin_info[1] = stripe_count; + stripin_info[2] = start_iodevice; + stripin_info[3] = numOSTs; + +#elif defined(MIMIC_LUSTRE) + char *env_str = getenv("MIMIC_STRIPE_SIZE"); + if (env_str != NULL) + stripin_info[0] = atoi(env_str); + else + stripin_info[0] = STRIPE_SIZE; + stripin_info[1] = STRIPE_COUNT; + stripin_info[2] = 0; + stripin_info[3] = STRIPE_COUNT; +#endif + } + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + fd->hints->fs_hints.lustre.num_osts = stripin_info[3]; + fd->hints->fs_hints.lustre.overstriping_ratio = stripin_info[1] / stripin_info[3]; + + /* construct cb_nodes rank list */ + Lustre_set_cb_node_list(fd); + + MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:"); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.num_osts); + MPI_Info_set(fd->info, "lustre_num_osts", int_str); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.overstriping_ratio); + MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str); + + return err; +} + diff --git a/src/drivers/pncio/pncio_lustre_wrcoll.c b/src/drivers/pncio/pncio_lustre_wrcoll.c new file mode 100644 index 000000000..03b0a59e9 --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_wrcoll.c @@ -0,0 +1,2389 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <pncio.h>
+
+static int use_alltoallw;
+
+#ifdef HAVE_MPI_LARGE_COUNT
+#define MEMCPY_UNPACK(x, inbuf, start, count, outbuf) {       \
+    int _k;                                                   \
+    char *_ptr = (inbuf);                                     \
+    MPI_Count  *mem_ptrs = others_req[x].mem_ptrs + (start);  \
+    MPI_Offset *mem_lens = others_req[x].lens     + (start);  \
+    for (_k=0; _k<count; _k++) {                              \
+        memcpy((outbuf) + mem_ptrs[_k], _ptr, mem_lens[_k]);  \
+        _ptr += mem_lens[_k];                                 \
+    }                                                         \
+}
+#else
+#define MEMCPY_UNPACK(x, inbuf, start, count, outbuf) {       \
+    int _k;                                                   \
+    char *_ptr = (inbuf);                                     \
+    MPI_Aint   *mem_ptrs = others_req[x].mem_ptrs + (start);  \
+    int        *mem_lens = others_req[x].lens     + (start);  \
+    for (_k=0; _k<count; _k++) {                              \
+        memcpy((outbuf) + mem_ptrs[_k], _ptr, mem_lens[_k]);  \
+        _ptr += mem_lens[_k];                                 \
+    }                                                         \
+}
+#endif
+
+/*----< LUSTRE_Calc_aggregator() >--------------------------------------------*/
+/* Given a file offset 'off' and length '*len', calculate which I/O
+ * aggregator's file domain the offset falls into. '*len' may be shortened
+ * to the amount covered by that aggregator's file domain.
+ */
+static
+int LUSTRE_Calc_aggregator(PNCIO_File *fd,
+                           MPI_Offset  off,
+#ifdef HAVE_MPI_LARGE_COUNT
+                           MPI_Offset *len
+#else
+                           int        *len
+#endif
+                          )
+{
+    MPI_Offset avail_bytes, stripe_id;
+
+    stripe_id = off / fd->hints->striping_unit;
+
+    avail_bytes = (stripe_id + 1) * fd->hints->striping_unit - off;
+    if (avail_bytes < *len) {
+        /* The request [off, off+len) has only [off, off+avail_bytes) part
+         * falling into aggregator's file domain */
+        *len = avail_bytes;
+    }
+    /* return the index to ranklist[] */
+    return (stripe_id % fd->hints->cb_nodes);
+}
+
+/*----< LUSTRE_Calc_my_req() >-----------------------------------------------*/
+/* calculates what portions of the read/write requests of this process fall
+ * into the file domains of all I/O aggregators.
+ * IN: fd->flat_file: this rank's flattened write requests
+ *     fd->flat_file.count: number of noncontiguous offset-length file
+ *         requests
+ *     fd->flat_file.off[fd->flat_file.count]: file offsets of individual
+ *         noncontiguous requests.
+ *     fd->flat_file.len[fd->flat_file.count]: lengths of individual
+ *         noncontiguous requests.
+ * IN: buf_is_contig: whether the write buffer is contiguous or not
+ * OUT: my_req_ptr[cb_nodes]: offset-length pairs of this process's requests
+ *      that fall into the file domain of each aggregator
+ * OUT: buf_idx_ptr[cb_nodes]: index pointing to the starting location in
+ *      user_buf for data to be sent to each aggregator.
+ */
+static
+void LUSTRE_Calc_my_req(PNCIO_File *fd,
+                        int buf_is_contig,
+                        PNCIO_Access **my_req_ptr,
+                        MPI_Offset **buf_idx)
+{
+    int aggr, *aggr_ranks, cb_nodes;
+    MPI_Count i, l;
+    size_t nelems, alloc_sz;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset rem_len, avail_len, *avail_lens;
+#else
+    int rem_len, avail_len, *avail_lens;
+#endif
+    MPI_Offset curr_idx, off;
+    PNCIO_Access *my_req;
+
+    cb_nodes = fd->hints->cb_nodes;
+
+    /* my_req[i].count gives the number of contiguous requests of this
+     * process that fall in aggregator i's file domain (not process MPI
+     * rank i).
+     */
+    my_req = (PNCIO_Access *) NCI_Calloc(cb_nodes, sizeof(PNCIO_Access));
+    *my_req_ptr = my_req;
+
+    /* First pass is just to calculate how much space is needed to allocate
+     * my_req.
+     */
+#ifdef HAVE_MPI_LARGE_COUNT
+    alloc_sz = sizeof(int) + sizeof(MPI_Offset);
+    aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count);
+    avail_lens = (MPI_Offset*) (aggr_ranks + fd->flat_file.count);
+#else
+    alloc_sz = sizeof(int) * 2;
+    aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count);
+    avail_lens = aggr_ranks + fd->flat_file.count;
+#endif
+
+    /* Note that the MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0
+     * Chapter 14.1.1) requires that the typemap displacements of etype and
+     * filetype are non-negative and monotonically non-decreasing. This
+     * makes fd->flat_file.off[] monotonically non-decreasing.
+     */
+
+/*
+Alternative, especially for when fd->flat_file.count is large:
+1. This rank's aggregate file access region is from start_offset to
+   end_offset.
+2. Start with the 1st aggregator ID and keep assigning aggregators until the
+   next stripe. This can avoid too many calls to LUSTRE_Calc_aggregator().
+*/
+
+    /* nelems will be the number of offset-length pairs for my_req[] */
+    nelems = 0;
+    for (i = 0; i < fd->flat_file.count; i++) {
+        /* short circuit offset/len processing if zero-byte read/write.
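+         * This first pass only counts how many offset-length pairs fall
+         * into each aggregator's file domain so that my_req[] can be
+         * allocated exactly once; the second pass further below fills the
+         * pairs in. The pattern in miniature (a sketch, not the actual
+         * code):
+         *
+         *     for (i=0; i<n; i++) cnt[target(i)]++;       // pass 1: count
+         *     // allocate per-target arrays sized by cnt[]
+         *     for (i=0; i<n; i++) append(target(i), i);   // pass 2: fill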
*/ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + avail_len = fd->flat_file.len[i]; + /* LUSTRE_Calc_aggregator() modifies the value of 'avail_len' to the + * amount that is only covered by the aggr's file domain. The remaining + * (tail) will continue to be processed to determine to whose file + * domain it belongs. As LUSTRE_Calc_aggregator() can be expensive for + * large value of fd->flat_file.count, we keep a copy of the returned + * values of 'aggr' and 'avail_len' in aggr_ranks[] and avail_lens[] to + * be used in the next for loop (not next iteration). + * + * Note the returned value in 'aggr' is the index to ranklist[], i.e. + * the 'aggr'th element of array ranklist[], rather than the + * aggregator's MPI rank ID in fd->comm. + */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + aggr_ranks[i] = aggr; /* first aggregator ID of this request */ + avail_lens[i] = avail_len; /* length covered, may be < fd->flat_file.len[i] */ + assert(aggr >= 0 && aggr <= cb_nodes); + my_req[aggr].count++; /* increment for aggregator aggr */ + nelems++; /* true number of noncontiguous requests + * in terms of file domains */ + + /* rem_len is the amount of ith offset-length pair that is not covered + * by aggregator aggr's file domain. + */ + rem_len = fd->flat_file.len[i] - avail_len; + assert(rem_len >= 0); + + while (rem_len > 0) { + off += avail_len; /* move forward to first remaining byte */ + avail_len = rem_len; /* save remaining size, pass to calc */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + my_req[aggr].count++; + nelems++; + rem_len -= avail_len;/* reduce remaining length by amount from fd */ + } + } + + /* allocate space for buf_idx. + * buf_idx is relevant only if buftype is contiguous. buf_idx[i] gives the + * starting index in user_buf where data will be sent to aggregator 'i'. + * This allows sends to be done without extra buffer. + */ + if (buf_idx != NULL && buf_is_contig) { + buf_idx[0] = (MPI_Offset *) NCI_Malloc(nelems * sizeof(MPI_Offset)); + for (i = 1; i < cb_nodes; i++) + buf_idx[i] = buf_idx[i - 1] + my_req[i - 1].count; + } + + /* allocate space for my_req and its members offsets and lens */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2; + my_req[0].offsets = (MPI_Offset*) NCI_Malloc(alloc_sz * nelems); + my_req[0].lens = my_req[0].offsets + my_req[0].count; + for (i=1; iflat_file.count; i++) { + /* short circuit offset/len processing if zero-byte read/write. */ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + aggr = aggr_ranks[i]; + assert(aggr >= 0 && aggr <= cb_nodes); + avail_len = avail_lens[i]; + + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len = fd->flat_file.len[i] - avail_len; + + /* Each my_req[i] contains the number of this process's noncontiguous + * requests that fall into aggregator aggr's file domain. + * my_req[aggr].offsets[] and my_req[aggr].lens store the offsets and + * lengths of the requests. 
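+         *
+         * For example (values assumed): with striping_unit = 1 MiB and
+         * cb_nodes = 4, a request at off = 3.5 MiB of len = 1 MiB is split
+         * into two pieces:
+         *
+         *     stripe 3 -> aggregator 3 % 4 = 3: off = 3.5 MiB, len = 0.5 MiB
+         *     stripe 4 -> aggregator 4 % 4 = 0: off = 4.0 MiB, len = 0.5 MiB
+         *
+         * so one flat_file entry can contribute offset-length pairs to more
+         * than one my_req[] element.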
+ */ + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + + while (rem_len != 0) { + off += avail_len; + avail_len = rem_len; + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + assert(aggr >= 0 && aggr <= cb_nodes); + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len -= avail_len; + + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + } + } + NCI_Free(aggr_ranks); +} + +/* LUSTRE_Calc_others_req() calculates what requests from each of other + * processes fall in this aggregator's file domain. + * IN: my_req[cb_nodes]: offset-length pairs of this rank's requests fall + * into each of aggregators + * OUT: count_others_req_per_proc[i]: number of noncontiguous requests of + * rank i that falls in this aggregator's file domain. + * OUT: others_req_ptr[nprocs]: requests of each of other ranks fall into + * this aggregator's file domain. + */ +static +void LUSTRE_Calc_others_req(PNCIO_File *fd, + const PNCIO_Access *my_req, + PNCIO_Access **others_req_ptr) +{ + int i, myrank, nprocs, do_alltoallv; + MPI_Count *count_my_req_per_proc, *count_others_req_per_proc; + PNCIO_Access *others_req; + size_t npairs, alloc_sz, pair_sz; + + /* first find out how much to send/recv and from/to whom */ + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + others_req = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + *others_req_ptr = others_req; + + /* Use my_req[i].count (the number of noncontiguous requests fall in + * aggregator i's file domain) to set count_others_req_per_proc[j] (the + * number of noncontiguous requests from process j fall into this + * aggregator's file domain). + */ + count_my_req_per_proc = (MPI_Count *) NCI_Calloc(nprocs * 2, sizeof(MPI_Count)); + count_others_req_per_proc = count_my_req_per_proc + nprocs; + for (i=0; ihints->cb_nodes; i++) + count_my_req_per_proc[fd->hints->ranklist[i]] = my_req[i].count; + + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); + + /* calculate total number of offset-length pairs to be handled by this + * aggregator, only aggregators will have non-zero number of pairs. + */ + npairs = 0; + for (i=0; inum_nodes > 0) ? (nprocs / fd->num_nodes > 48) : 0; +#else + do_alltoallv=0; +#endif + + if (do_alltoallv) { + MPI_Offset *r_off_buf=NULL, *s_off_buf=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) * 2 + sizeof(MPI_Aint) * 2; + sendCounts = (MPI_Count*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = (MPI_Aint*) (recvCounts + nprocs); + rdispls = sdispls + nprocs; +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 4; + sendCounts = (int*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = recvCounts + nprocs; + rdispls = sdispls + nprocs; +#endif + + /* prepare receive side */ + r_off_buf = others_req[0].offsets; + for (i=0; ihints->cb_nodes; i++) { + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = my_req[i].count * pair_sz; + /* Note all my_req[*].offsets are allocated in a single malloc(). 
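+             * Because of that single allocation, each my_req[i].offsets
+             * sits at a fixed byte distance from my_req[0].offsets, so the
+             * MPI_BYTE displacement for MPI_Alltoallv is just a pointer
+             * difference. Layout sketch (counts assumed):
+             *
+             *     base = my_req[0].offsets
+             *     | req[0].offsets | req[0].lens | req[1].offsets | ...
+             *     sdispls[dest] = (char*)my_req[i].offsets - (char*)base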
*/ + sdispls[dest] = (char*)my_req[i].offsets - (char*)s_off_buf; + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallv_c(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#else + MPI_Alltoallv(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#endif + + NCI_Free(sendCounts); + } + else { /* instead of using alltoall, use MPI_Issend and MPI_Irecv */ + int nreqs; + MPI_Request *requests = (MPI_Request *) + NCI_Malloc((nprocs + fd->hints->cb_nodes) * sizeof(MPI_Request)); + + nreqs = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) /* nothing to receive from rank i */ + continue; + + /* Note the memory address of others_req[i].lens is right after + * others_req[i].offsets. This allows the following recv call to + * receive both offsets and lens in a single call. + */ + if (i == myrank) { + /* send to self uses memcpy(), here + * others_req[i].count == my_req[fd->my_cb_nodes_index].count + */ + memcpy(others_req[i].offsets, + my_req[fd->my_cb_nodes_index].offsets, + my_req[fd->my_cb_nodes_index].count * pair_sz); + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#else + MPI_Irecv(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#endif + } + } + +#ifdef WKL_DEBUG +/* WRF hangs below when calling MPI_Waitall(), at running 16 nodes, 128 ranks + * per node on Perlmutter, when these 3 env variables are set: + * FI_UNIVERSE_SIZE = 2048 + * FI_CXI_DEFAULT_CQ_SIZE = 524288 + * FI_CXI_RX_MATCH_MODE = software + * + * Using MPI_Alltoallv seems to be able to avoid such hanging problem. (above) + */ +// MPI_Barrier(fd->comm); /* This barrier prevents the MPI_Waitall below from hanging !!! */ +#endif + + for (i=0; ihints->cb_nodes; i++) { + if (my_req[i].count == 0 || i == fd->my_cb_nodes_index) + continue; /* nothing to send or send to self */ + + /* Note the memory address of my_req[i].lens is right after + * my_req[i].offsets. This allows the following Issend call to + * send both offsets and lens in a single call. + */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Issend_c(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#else + MPI_Issend(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#endif + } + + if (nreqs) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, requests, statuses); + NCI_Free(statuses); +#endif + } + NCI_Free(requests); + } +} + +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. 
+ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + int i, j, nprocs, myrank; + int do_collect = 1, do_ex_wr; + MPI_Offset start_offset, end_offset; + MPI_Offset min_st_loc = -1, max_end_loc = -1; + MPI_Offset w_len=0; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + +// printf("%s %d: offset=%lld\n",__func__,__LINE__,offset); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +MPI_Barrier(fd->comm); +double curT = MPI_Wtime(); +#endif + + /* fd->flat_file contains a list of starting file offsets and lengths of + * write requests made by this rank. Similarly, buf_view contains a list of + * offset-length pairs describing the write buffer layout. Note as PnetCDF + * never re-uses a fileview or buffer view. + * + * Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and filetype + * set by the user are non-negative and monotonically non-decreasing. This + * makes fd->flat_file.off[] to be monotonically non-decreasing. + * + * This rank's aggregate file access region is from start_offset to + * end_offset. Note: end_offset points to the last byte-offset to be + * accessed. E.g., if start_offset=0 and end_offset=99, then the aggregate + * file access region is of size 100 bytes. If this rank has no data to + * write, end_offset == (start_offset - 1) + */ + MPI_Offset one_off; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset one_len; +#else + int one_len; +#endif + + if (fd->flat_file.count == 0) { /* TODO: is fd->flat_file.count == 0? */ + /* whole file is visible */ + start_offset = offset; + end_offset = offset + buf_view.size - 1; + if (buf_view.size > 0) { /* no-zero sized request */ + /* setting fd->flat_file is necessary for constructing my_req */ + one_off = offset; + one_len = buf_view.size; + fd->flat_file.off = &one_off; + fd->flat_file.len = &one_len; + fd->flat_file.count = 1; + } + } + else { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } +/* + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } +*/ +// if (myrank==0) printf("%s %d: fd->flat_file size=%lld count=%lld offset=%lld start_offset=%lld end_offset=%lld\n",__func__,__LINE__, fd->flat_file.size, fd->flat_file.count,offset,start_offset,end_offset); + + buf_view.idx = 0; + buf_view.rem = buf_view.size; + if (buf_view.count > 1) + buf_view.rem = buf_view.len[0]; + + if (fd->hints->cb_write == PNCIO_HINT_DISABLE) { + /* collective write is explicitly disabled by user */ + do_collect = 0; + } + else { + /* Calculate the aggregate access region of all ranks and check if + * write requests are interleaved among all ranks. + */ + int is_interleaved, large_indv_req = 1; + MPI_Offset striping_range, st_end[2], *st_end_all = NULL; + + /* Gather starting and ending file offsets of write requests from all + * ranks into st_end_all[]. Even indices of st_end_all[] are starting + * offsets, and odd indices are ending offsets. + */ + st_end[0] = start_offset; + st_end[1] = end_offset; + st_end_all = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + MPI_Allgather(st_end, 2, MPI_OFFSET, st_end_all, 2, MPI_OFFSET, fd->comm); + + /* The loop below does the followings. + * 1. 
Calculate this rank's aggregate access region. + * 2. Check whether or not the requests are interleaved among all ranks. + * 3. Check whether there are LARGE individual requests. Here, "large" + * means a write range is > (striping_factor * striping_unit). In + * this case, independent write will perform faster than collective. + */ + striping_range = fd->hints->striping_unit * fd->hints->striping_factor; + is_interleaved = 0; + for (i = 0; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + min_st_loc = st_end_all[i]; + max_end_loc = st_end_all[i + 1]; + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; /* j is the rank of making first non-zero request */ + i += 2; + break; + } + for (; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + if (st_end_all[i] < st_end_all[j+1]) { + /* start offset of process rank (i/2) is less than the end + * offset of process rank (i/2-1) + */ + is_interleaved = 1; + } + min_st_loc = MIN(st_end_all[i], min_st_loc); + max_end_loc = MAX(st_end_all[i + 1], max_end_loc); + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; + } + NCI_Free(st_end_all); + +// if (myrank==0) printf("%s %d: do_collect=%d is_interleaved=%d buf_view size=%lld count=%lld is_contig=%d start_offset=%lld end_offset=%lld\n",__func__,__LINE__, do_collect,is_interleaved,buf_view.size,buf_view.count,buf_view.is_contig, start_offset,end_offset); + if (fd->hints->cb_write == PNCIO_HINT_ENABLE) { + /* explicitly enabled by user */ + do_collect = 1; + } + else if (fd->hints->cb_write == PNCIO_HINT_AUTO) { +// if (myrank==0) printf("%s %d: large_indv_req=%d cb_nodes=%d striping_factor=%d\n",__func__,__LINE__, large_indv_req,fd->hints->cb_nodes , fd->hints->striping_factor); + /* Check if collective write is actually necessary, only when + * cb_write hint is set to PNCIO_HINT_AUTO. + * + * Two typical access patterns can benefit from collective write. + * 1) access file regions of all processes are interleaved, and + * 2) the individual request sizes are not too big, i.e. no + * bigger than striping_range. Large individual requests may + * result in a high communication cost in order to + * redistribute requests from non-aggregators to I/O + * aggregators. 
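+             *
+             * A worked example of the heuristic (numbers assumed): with
+             * striping_unit = 1 MiB and striping_factor = 8, the
+             * striping_range is 8 MiB. If every rank writes one contiguous,
+             * non-interleaved 16 MiB block, each request already spans
+             * whole striping ranges on its own, and forwarding that data to
+             * aggregators would only add communication cost, so independent
+             * write is chosen below.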
+ */ + if (nprocs == 1) + do_collect = 0; + else if (!is_interleaved && large_indv_req && + fd->hints->cb_nodes <= fd->hints->striping_factor) { + /* do independent write, if every rank's write range > + * striping_range and writes are not interleaved in file + * space + */ + do_collect = 0; + } + } + } + + /* If collective I/O is determined not necessary, use independent I/O */ + if (!do_collect) { + + if (buf_view.size == 0) /* zero-sized request */ + return 0; + + if (fd->flat_file.is_contig && buf_view.is_contig) { + /* both buffer and fileview are contiguous */ + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_WriteContig !!!\n",__func__,__LINE__); +#endif + + return PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_LUSTRE_WriteStrided !!!\n", + __func__,__LINE__); +#endif + + return PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset); + } + + /* Now we are using collective I/O (two-phase I/O strategy) */ + +#ifdef ADJUST_STRIPING_UNIT + /* adjust striping_unit when striping_factor is twice or more than the + * number of compute nodes. Note cb_node is set to at least + * striping_factor, if nprocs >= striping_factor. Adjustment below is to + * let each aggregator to write to two or more consecutive OSTs, which can + * most likely improve the performance. This will still yield an effect of + * any one OST receiving write requests from aggregators running on only + * one compute node. + */ + int orig_striping_unit = fd->hints->striping_unit; + + if (fd->hints->striping_factor >= fd->num_nodes * 2) { + fd->hints->striping_unit *= (fd->hints->striping_factor / fd->num_nodes); + + if (fd->hints->cb_buffer_size < fd->hints->striping_unit) { + char value[MPI_MAX_INFO_VAL + 1]; + + fd->hints->cb_buffer_size = fd->hints->striping_unit; + sprintf(value, "%d", fd->hints->cb_buffer_size); + MPI_Info_set(fd->info, "cb_buffer_size", value); + if (fd->is_agg) { + NCI_Free(fd->io_buf); + fd->io_buf = (void*) NCI_Calloc(1, fd->hints->cb_buffer_size); + } + } +#ifdef WKL_DEBUG + if (myrank == 0) + printf("Warning: %s line %d: Change striping_unit from %d to %d\n", + __func__, __LINE__, orig_striping_unit, fd->hints->striping_unit); +#endif + } +#endif + + /* my_req[cb_nodes] is an array of access info, one for each I/O aggregator + * whose file domain has this rank's request. + */ + PNCIO_Access *my_req; + + /* others_req[nprocs] is an array of access info, one for each ranks, both + * aggregators and non-aggregators, whose write requests fall into this + * aggregator's file domain. others_req[] matters only for aggregators. + */ + PNCIO_Access *others_req; + MPI_Offset **buf_idx = NULL; + + if (buf_view.is_contig) + buf_idx = (MPI_Offset **) NCI_Malloc(fd->hints->cb_nodes * + sizeof(MPI_Offset*)); + + /* Calculate the portions of this rank's write requests that fall into the + * file domains of each I/O aggregator. No inter-process communication is + * performed in LUSTRE_Calc_my_req(). + */ + LUSTRE_Calc_my_req(fd, buf_view.is_contig, &my_req, buf_idx); + + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) { + /* When data sieving is considered, below check the current file size + * first. If the aggregate access region of this collective write is + * beyond the current file size, then we can safely skip the read of + * the read-modify-write of data sieving. + */ + if (fd->is_agg) { + /* Obtain the current file size. 
Note an MPI_Allgather() has been + * called above to calculate the aggregate access region. Thus all + * prior independent I/O should have completed by now, so it is + * safe to call lseek() to query the file size. + */ + MPI_Offset cur_off, fsize; + + cur_off = lseek(fd->fd_sys, 0, SEEK_CUR); + fsize = lseek(fd->fd_sys, 0, SEEK_END); + /* Ignore the error, and proceed as if file size is very large. */ +#ifdef PNETCDF_DEBUG + if (fsize == -1) + fprintf(stderr, "%s at %d: lseek SEEK_END failed on file %s (%s)\n", + __func__,__LINE__, fd->filename, strerror(errno)); +#endif + fd->skip_read = (fsize >=0 && min_st_loc >= fsize); + + /* restore file pointer */ + lseek(fd->fd_sys, cur_off, SEEK_SET); + } + } + else + fd->skip_read = 1; + +// if (fd->is_agg && !fd->skip_read) { MPI_Offset fsize = lseek(fd->fd_sys, 0, SEEK_END); printf("%d: %s at %d: skip_read=%d min_st_loc=%lld fsize=%lld\n",myrank,__func__,__LINE__,fd->skip_read,min_st_loc,fsize); } + + /* For aggregators, calculate the portions of all other ranks' requests + * fall into this aggregator's file domain (note only I/O aggregators are + * assigned file domains). + * + * Inter-process communication is required to construct others_req[], + * including MPI_Alltoall, MPI_Issend, MPI_Irecv, and MPI_Waitall. + */ + LUSTRE_Calc_others_req(fd, my_req, &others_req); + + /* Two-phase I/O: first communication phase to exchange write data from all + * ranks to the I/O aggregators, followed by the write phase where only I/O + * aggregators write to the file. + * + * Unless MPI_Alltoallw() is used (when use_alltoallw is set to 1), there + * is no collective MPI communication beyond this point, as + * LUSTRE_Exch_and_write() calls only MPI_Issend, MPI_Irecv, and + * MPI_Waitall. Thus it is safe for those non-aggregators making zero-sized + * request to skip the call. + */ + + /* if this rank has data to write, then participate exchange-and-write */ + do_ex_wr = (buf_view.size == 0) ? 0 : 1; + use_alltoallw = 0; + +#ifdef USE_MPI_ALLTOALLW + { + /* When num_nodes < striping_factor, using MPI_Alltoallw in + * commit_comm_phase() is faster than MPI_Issend/MPI_Irecv ... ? + */ + char *env_str; + if ((env_str = getenv("PNETCDF_USE_ALLTOALLW")) != NULL) + use_alltoallw = (strcasecmp(env_str, "true") == 0) ? 1: 0; + } +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + + if (do_ex_wr || fd->is_agg) + /* This rank participates exchange and write only when it has non-zero + * data to write or is an I/O aggregator + */ + w_len = LUSTRE_Exch_and_write(fd, buf, buf_view, others_req, my_req, + min_st_loc, max_end_loc, buf_idx); + + /* free all memory allocated */ + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); + + if (buf_idx != NULL) { + NCI_Free(buf_idx[0]); + NCI_Free(buf_idx); + } + NCI_Free(my_req[0].offsets); + NCI_Free(my_req); + +#ifdef ADJUST_STRIPING_UNIT + /* restore the original striping_unit */ + fd->hints->striping_unit = orig_striping_unit; +#endif + + /* If this collective write is followed by an independent write, it's + * possible to have those subsequent writes on other processes race ahead + * and sneak in before the read-modify-write completes. We carry out a + * collective communication at the end here so no one can start independent + * I/O before collective I/O completes. 
+ * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information + */ + /* optimization: if only one process performing I/O, we can perform + * a less-expensive Bcast. */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +static +void comm_phase_alltoallw(PNCIO_File *fd, + disp_len_list *send_list, /* [cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine performs the sam communication tasks as the below + * commit_comm_phase(), but using MPI_Alltoallw() instead of MPI_Issend and + * MPI_Irecv. + * + * It creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes. + */ + int i, nprocs, rank; + size_t alloc_sz; + MPI_Datatype *sendTypes, *recvTypes; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + /* calculate send/recv derived types metadata */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) + sizeof(MPI_Aint); + sendCounts = (MPI_Count*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (MPI_Aint*) (sendCounts + (nprocs * 2)); +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 2; + sendCounts = (int*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (int*) (sendCounts + (nprocs * 2)); +#endif + recvCounts = sendCounts + nprocs; + rdispls = sdispls + nprocs; + + /* allocate send/recv derived type arrays */ + sendTypes = (MPI_Datatype*)NCI_Malloc(sizeof(MPI_Datatype) * nprocs * 2); + recvTypes = sendTypes + nprocs; + + for (i=0; iis_agg && recv_list != NULL) { + for (i=0; ihints->cb_nodes; i++) { + /* check if nothing to send or if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = 1; + + /* combine reqs using new datatype */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#else + MPI_Type_create_hindexed(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#endif + MPI_Type_commit(&sendTypes[dest]); + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallw_c(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#else + MPI_Alltoallw(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#endif + + for (i=0; ihints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +static +void commit_comm_phase(PNCIO_File *fd, + disp_len_list *send_list, /* 
[cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes. + */ + int i, nprocs, rank, nreqs; + MPI_Request *reqs; + MPI_Datatype sendType, recvType; +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int j; + double dtype_time=MPI_Wtime(); +#endif + + if (use_alltoallw) + return comm_phase_alltoallw(fd, send_list, recv_list); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + nreqs = fd->hints->cb_nodes; + nreqs += (fd->is_agg) ? nprocs : 0; + reqs = (MPI_Request *)NCI_Malloc(sizeof(MPI_Request) * nreqs); + nreqs = 0; + + /* receiving part */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* recv buffer type profiling */ + int nrecvs=0; + MPI_Offset max_r_amnt=0, max_r_count=0; +#endif + + if (fd->is_agg && recv_list != NULL) { + for (i = 0; i < nprocs; i++) { + /* check if nothing to receive or if self */ + if (recv_list[i].count == 0 || i == rank) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset r_amnt=0; + for (j=0; jatomicity) { /* Blocking Recv */ + MPI_Status status; + MPI_Recv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, &status); + } + else + MPI_Irecv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, + &reqs[nreqs++]); + MPI_Type_free(&recvType); + } + } + + /* send reqs */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* send buffer type profiling */ + int nsends=0; + MPI_Offset max_s_amnt=0, max_s_count=0; +#endif + + for (i = 0; i < fd->hints->cb_nodes; i++) { + /* check if nothing to send or if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset s_amnt=0; + for (j=0; jhints->ranklist[i], 0, + fd->comm, &reqs[nreqs++]); + MPI_Type_free(&sendType); + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[4] += MPI_Wtime() - dtype_time; + +/* + fd->write_counter[2] = MAX(fd->write_counter[2], nsends); + fd->write_counter[3] = MAX(fd->write_counter[3], nrecvs); + fd->write_counter[4] = MAX(fd->write_counter[4], max_r_amnt); + fd->write_counter[5] = MAX(fd->write_counter[5], max_s_amnt); + fd->write_counter[6] = MAX(fd->write_counter[6], max_r_count); + fd->write_counter[7] = MAX(fd->write_counter[7], max_s_count); +*/ +#endif + + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, reqs, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(reqs); + + /* clear send_list and recv_list for future reuse */ + for (i = 0; i < fd->hints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +/*----< LUSTRE_Exch_and_write() >--------------------------------------------*/ +/* Each process sends all its write requests to I/O aggregators based on the + * file domain assignment to the aggregators. In this implementation, a file is + * first divided into stripes which are assigned to the aggregators in a + * round-robin fashion. 
The "exchange" of write data from non-aggregators to + * aggregators is carried out in 'ntimes' rounds. Each round covers an + * aggregate file region of size equal to the file stripe size times the number + * of I/O aggregators. The file writes are carried out in every 'nbufs' + * iterations, where 'nbufs' == cb_buffer_size / file stripe size. This approach + * is different from ROMIO's implementation as in MPICH 4.2.3. + * + * Other implementations developers are referring to the paper: Wei-keng Liao, + * and Alok Choudhary. "Dynamically Adapting File Domain Partitioning Methods + * for Collective I/O Based on Underlying Parallel File System Locking + * Protocols", in The Supercomputing Conference, 2008. + */ +static +MPI_Offset LUSTRE_Exch_and_write(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + PNCIO_Access *others_req, + PNCIO_Access *my_req, + MPI_Offset min_st_loc, + MPI_Offset max_end_loc, + MPI_Offset **buf_idx) +{ + char **write_buf = NULL, **recv_buf = NULL, **send_buf = NULL; + size_t alloc_sz; + int nprocs, myrank, nbufs, ibuf, batch_idx=0, cb_nodes, striping_unit; + MPI_Count i, j, m, ntimes; + MPI_Count **recv_size=NULL, **recv_count=NULL; + MPI_Count **recv_start_pos=NULL, *send_size; + MPI_Offset end_loc, req_off, iter_end_off, *off_list, step_size; + MPI_Offset *this_buf_idx=NULL; + off_len_list *srt_off_len = NULL; + disp_len_list *send_list = NULL, *recv_list = NULL; + MPI_Offset w_len, total_w_len=0; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; + striping_unit = fd->hints->striping_unit; + + /* The aggregate access region (across all processes) of this collective + * write starts from min_st_loc and ends at max_end_loc. The collective + * write is carried out in 'ntimes' rounds of two-phase I/O. Each round + * covers an aggregate file region of size 'step_size' written only by + * cb_nodes number of I/O aggregators. Note non-aggregators must also + * participate all ntimes rounds to send their requests to I/O aggregators. + * + * step_size = the number of I/O aggregators x striping_unit + * + * Note the number of write phases = ntimes / nbufs, as writes (and + * communication) are accumulated for nbufs rounds before flushed. + */ + step_size = (MPI_Offset)cb_nodes * striping_unit; + + /* align min_st_loc downward to the nearest file stripe boundary */ + min_st_loc -= min_st_loc % (MPI_Offset) striping_unit; + + /* ntimes is the number of rounds of two-phase I/O */ + ntimes = (max_end_loc - min_st_loc + 1) / step_size; + if ((max_end_loc - min_st_loc + 1) % step_size) + ntimes++; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], ntimes); +#endif + + /* collective buffer is divided into 'nbufs' sub-buffers. Each sub-buffer + * is of size equal to Lustre stripe size. Write data of non-aggregators + * are sent to aggregators and stored in aggregators' sub-buffers, one for + * each round. All nbufs sub-buffers are altogether flushed to file every + * nbufs rounds. + * + * fd->hints->cb_buffer_size, collective buffer size, for Lustre must be at + * least striping_unit. This requirement has been checked at the file + * open/create time when fd->io_buf is allocated. + * + * Note cb_buffer_size and striping_unit may also be adjusted earlier in + * PNCIO_LUSTRE_WriteStridedColl(). 
+ */ + nbufs = fd->hints->cb_buffer_size / striping_unit; + assert(nbufs > 0); /* must at least 1 */ + + /* in case number of rounds is less than nbufs */ + nbufs = (ntimes < nbufs) ? (int)ntimes : nbufs; + + /* off_list[m] is the starting file offset of this aggregator's write + * region in iteration m (file domain of iteration m). This offset + * may not be aligned with file stripe boundaries. + * end_loc is the ending file offset of this aggregator's file domain. + */ + off_list = (MPI_Offset *) NCI_Malloc(ntimes * sizeof(MPI_Offset)); + end_loc = -1; + for (m = 0; m < ntimes; m++) + off_list[m] = max_end_loc; + for (i = 0; i < nprocs; i++) { +// if (myrank == 0) printf("%s at %d: others_req[%d] count=%lld\n",__func__,__LINE__, i,others_req[i].count); + for (j = 0; j < others_req[i].count; j++) { + req_off = others_req[i].offsets[j]; + m = (int) ((req_off - min_st_loc) / step_size); + off_list[m] = MIN(off_list[m], req_off); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); + } + } +// if (myrank == 0) printf("%s at %d: end_loc=%lld nbufs=%d recv_list=%s\n",__func__,__LINE__, end_loc,nbufs,(recv_list==NULL)?"NULL":"NOT NULL"); + + /* Allocate displacement-length pair arrays, describing the send buffer. + * send_list[i].count: number displacement-length pairs. + * send_list[i].len: length in bytes. + * send_list[i].disp: displacement (send buffer address). + */ + send_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * cb_nodes); + for (i = 0; i < cb_nodes; i++) { + send_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + send_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = send_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + send_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = (int*) (send_list[i].disp + nbufs); +#endif + } + + /* end_loc >= 0 indicates this process has something to write to the file. + * Only I/O aggregators can have end_loc > 0. write_buf is the collective + * buffer and only matter for I/O aggregators. recv_buf is the buffer used + * only by aggregators to receive requests from non-aggregators. Its size + * may be larger then the file stripe size, in case when writes from + * non-aggregators overlap. In this case, it will be realloc-ed in + * LUSTRE_W_Exchange_data(). The received data is later copied over to + * write_buf, whose contents will be written to file. + */ + if (end_loc >= 0 && nbufs > 0) { + /* Allocate displacement-length pair arrays, describing the recv buffer. + * recv_list[i].count: number displacement-length pairs. + * recv_list[i].len: length in bytes. + * recv_list[i].disp: displacement (recv buffer address). + */ + assert(fd->is_agg); + + recv_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * nprocs); + for (i = 0; i < nprocs; i++) { + recv_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + recv_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = recv_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + recv_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = (int*) (recv_list[i].disp + nbufs); +#endif + } + + /* collective buffer was allocated at file open/create. For Lustre, its + * size must be at least striping_unit, which has been checked at the + * time fd->io_buf is allocated. 
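+         *
+         * The carving of fd->io_buf into nbufs stripe-sized sub-buffers,
+         * done just below, is plain pointer arithmetic; a sketch:
+         *
+         *     write_buf[0] = fd->io_buf;
+         *     for (j = 1; j < nbufs; j++)
+         *         write_buf[j] = write_buf[j-1] + striping_unit;
+         *
+         * so each write_buf[j] aliases the j-th stripe-sized slice of the
+         * collective buffer rather than being a separate allocation.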
+ */ + assert(fd->io_buf != NULL); + + /* divide collective buffer into nbufs sub-buffers */ + write_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + write_buf[0] = fd->io_buf; + + /* Similarly, receive buffer consists of nbufs sub-buffers */ + recv_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + recv_buf[0] = (char *) NCI_Malloc(striping_unit); + + /* recv_count[j][i] is the number of off-len pairs to be received from + * each proc i in round j + */ + recv_count = (MPI_Count**) NCI_Malloc(3 * nbufs * sizeof(MPI_Count*)); + recv_count[0] = (MPI_Count*) NCI_Malloc(3 * nbufs * nprocs * sizeof(MPI_Count)); + + /* recv_size[j][i] is the receive size from proc i in round j */ + recv_size = recv_count + nbufs; + recv_size[0] = recv_count[0] + nbufs * nprocs; + + /* recv_start_pos[j][i] is the starting index of offset-length arrays + * pointed by others_req[i].curr for remote rank i in round j + */ + recv_start_pos = recv_size + nbufs; + recv_start_pos[0] = recv_size[0] + nbufs * nprocs; + + for (j = 1; j < nbufs; j++) { + write_buf[j] = write_buf[j-1] + striping_unit; + /* recv_buf[j] may be realloc in LUSTRE_W_Exchange_data() */ + recv_buf[j] = (char *) NCI_Malloc(striping_unit); + recv_count[j] = recv_count[j-1] + nprocs; + recv_size[j] = recv_size[j-1] + nprocs; + recv_start_pos[j] = recv_start_pos[j-1] + nprocs; + } + + /* srt_off_len consists of file offset-length pairs sorted in a + * monotonically non-decreasing order (required by MPI-IO standard) + * which is used when writing to the file + */ + srt_off_len = (off_len_list*) NCI_Malloc(nbufs * sizeof(off_len_list)); + } + + /* send_buf[] will be allocated in LUSTRE_W_Exchange_data(), when the use + * buffer is not contiguous. + */ + send_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + + /* this_buf_idx contains indices to the user write buffer for sending this + * rank's write data to aggregators, one for each aggregator. It is used + * only when user buffer is contiguous. + */ + if (buf_view.is_contig) + this_buf_idx = (MPI_Offset *) NCI_Malloc(sizeof(MPI_Offset) * cb_nodes); + + /* array of data sizes to be sent to each aggregator in a 2-phase round */ + send_size = (MPI_Count *) NCI_Calloc(cb_nodes, sizeof(MPI_Count)); + + /* min_st_loc is the beginning file offsets of the aggregate access region + * of this collective write, and it has been downward aligned to the + * nearest file stripe boundary + * iter_end_off is the ending file offset of aggregate write region of + * iteration m, upward aligned to the file stripe boundary. + */ + iter_end_off = min_st_loc + step_size; + + ibuf = 0; + for (m = 0; m < ntimes; m++) { + MPI_Count range_size; + MPI_Offset range_off; + + /* Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and + * filetype are non-negative and monotonically non-decreasing. This + * simplifies implementation a bit compared to reads. + */ + + /* Calculate what should be communicated. + * + * First, calculate the amount to be sent to each aggregator i, at this + * round m, by going through all offset-length pairs in my_req[i]. + * + * iter_end_off - ending file offset of aggregate write region of this + * round, and upward aligned to the file stripe + * boundary. Note the aggregate write region of this + * round starts from (iter_end_off-step_size) to + * iter_end_off, aligned with file stripe boundaries. + * send_size[i] - total size in bytes of this process's write data + * fall into aggregator i's FD in this round. 
+         * recv_size[m][i]  - size in bytes of the data to be received by
+         *                    this aggregator from process i in round m.
+         * recv_count[m][i] - number of noncontiguous offset-length pairs
+         *                    from process i that fall into this
+         *                    aggregator's write region in round m.
+         */
+        for (i = 0; i < cb_nodes; i++) {
+            /* reset communication metadata to all 0s for this round */
+            send_size[i] = 0;
+
+            /* my_req[i].count is the number of this rank's offset-length
+             * pairs to be sent to aggregator i
+             */
+            if (my_req[i].count == 0) continue;
+
+            if (my_req[i].curr == my_req[i].count)
+                continue; /* done with aggregator i */
+
+            if (buf_view.is_contig)
+                /* buf_idx is used only when the user buffer is contiguous.
+                 * this_buf_idx[i] points to the starting offset of the user
+                 * buffer, buf, for the amount of send_size[i] to be sent to
+                 * aggregator i in this round.
+                 */
+                this_buf_idx[i] = buf_idx[i][my_req[i].curr];
+
+            /* calculate the send amount from this rank to aggregator i */
+            for (j = my_req[i].curr; j < my_req[i].count; j++) {
+                if (my_req[i].offsets[j] < iter_end_off)
+                    send_size[i] += my_req[i].lens[j];
+                else
+                    break;
+            }
+
+            /* update my_req[i].curr to point to the jth offset-length pair
+             * of my_req[i], which will be used as the first pair in the
+             * next round of iteration.
+             */
+            my_req[i].curr = j;
+        }
+
+        /* range_off is the starting file offset of this aggregator's write
+         * region in this round (it may not be aligned to a stripe
+         * boundary). range_size is the size (in bytes) of this aggregator's
+         * write region in this round (always <= striping_unit).
+         */
+        range_off = off_list[m];
+        range_size = MIN(striping_unit - range_off % striping_unit,
+                         end_loc - range_off + 1);
+
+        /* Calculate the amount to be received from each process i in this
+         * round, by going through all offset-length pairs of
+         * others_req[i].
+         */
+        if (recv_count != NULL) {
+            for (i = 0; i < nprocs; i++) {
+                recv_count[ibuf][i] = 0;
+                recv_size[ibuf][i] = 0;
+                recv_start_pos[ibuf][i] = others_req[i].curr;
+                for (j = others_req[i].curr; j < others_req[i].count; j++) {
+                    if (others_req[i].offsets[j] < iter_end_off) {
+                        recv_count[ibuf][i]++;
+                        recv_size[ibuf][i] += others_req[i].lens[j];
+                    } else
+                        break;
+                }
+                others_req[i].curr = j;
+            }
+        }
+
+        iter_end_off += step_size;
+
+        /* communication for this round (posting sends/receives and
+         * completing a batch of nbufs rounds) is carried out by
+         * Exchange_data_send() and Exchange_data_recv() defined below
+         */
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        if (fd->is_agg) fd->write_timing[3] += MPI_Wtime() - curT;
+#endif
+
+        /* free send_buf allocated in LUSTRE_W_Exchange_data() */
+        for (j = 0; j < numBufs; j++) {
+            if (send_buf[j] != NULL) {
+                NCI_Free(send_buf[j]);
+                send_buf[j] = NULL;
+            }
+        }
+        if (!fd->is_agg) /* non-aggregators are done for this batch */
+            continue;
+
+        if (recv_list == NULL) /* this aggregator has nothing to write */
+            continue;
+
+        /* this aggregator unpacks the data in recv_buf[] into write_buf */
+        if (end_loc >= 0) {
+            for (j = 0; j < numBufs; j++) {
+                char *buf_ptr = recv_buf[j];
+                for (i = 0; i < nprocs; i++) {
+                    if (recv_count[j][i] > 1 && i != myrank) {
+                        /* When recv_count[j][i] == 1, this case has been
+                         * taken care of earlier by receiving the message
+                         * directly into write_buf.
+                         */
+                        MEMCPY_UNPACK(i, buf_ptr, recv_start_pos[j][i],
+                                      recv_count[j][i], write_buf[j]);
+                        buf_ptr += recv_size[j][i];
+                    }
+                }
+            }
+        }
+
+        /* this aggregator writes to numBufs number of stripes */
+        for (j = 0; j < numBufs; j++) {
+            /* When srt_off_len[j].num > 1,
+             * data sieving was not performed and holes have been found. In
+             * this case, srt_off_len[] is the list of sorted offset-length
+             * pairs describing noncontiguous writes. Now call writes for
+             * each offset-length pair. Note the offset-length pairs
+             * (represented by srt_off_len[j].off, srt_off_len[j].len, and
+             * srt_off_len[j].num) have been coalesced in
+             * LUSTRE_W_Exchange_data().
+             */
+// printf("%s at %d: num=%d\n",__func__,__LINE__, srt_off_len[j].num);
+            for (i = 0; i < srt_off_len[j].num; i++) {
+                /* all write requests in this round should fall into the
+                 * file range of [range_off, range_off+range_size). The
+                 * assertion below should never fail.
+ */ + assert(srt_off_len[j].off[i] < range_off + range_size && + srt_off_len[j].off[i] >= range_off); + +// printf("%s at %d: PNCIO_WriteContig num=%d [%d] off=%lld len=%lld\n",__func__,__LINE__, srt_off_len[j].num,i,srt_off_len[j].off[i],srt_off_len[j].len[i]); + w_len = PNCIO_WriteContig(fd, + write_buf[j] + (srt_off_len[j].off[i] - range_off), + srt_off_len[j].len[i], + srt_off_len[j].off[i]); + if (w_len < 0) goto over; + total_w_len += w_len; + } + if (srt_off_len[j].num > 0) { + NCI_Free(srt_off_len[j].off); + srt_off_len[j].num = 0; + } + } + batch_idx += numBufs; /* only matters for aggregators */ + } + } + + over: + if (srt_off_len) + NCI_Free(srt_off_len); + if (write_buf != NULL) + NCI_Free(write_buf); + if (recv_buf != NULL) { + for (j = 0; j < nbufs; j++) + NCI_Free(recv_buf[j]); + NCI_Free(recv_buf); + } + if (recv_count != NULL) { + NCI_Free(recv_count[0]); + NCI_Free(recv_count); + } + NCI_Free(send_size); + NCI_Free(off_list); + if (buf_view.is_contig) + NCI_Free(this_buf_idx); + if (send_buf != NULL) + NCI_Free(send_buf); + if (send_list != NULL) { + for (i = 0; i < cb_nodes; i++) + NCI_Free(send_list[i].disp); + NCI_Free(send_list); + } + if (recv_list != NULL) { + for (i = 0; i < nprocs; i++) + NCI_Free(recv_list[i].disp); + NCI_Free(recv_list); + } + +#ifdef WKL_DEBUG + /* check any pending messages to be received */ + MPI_Status probe_st; + int probe_flag; + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, fd->comm, &probe_flag, &probe_st); + if (probe_flag) { + printf("ERROR ++++ MPI_Iprobe rank=%4d is_agg=%d: ---- cb_nodes=%d ntimes=%lld nbufs=%d\n",myrank,fd->is_agg,cb_nodes,ntimes,nbufs); + fflush(stdout); + } +#endif + return total_w_len; +} + +/* This heap-merge sort also coalesces sorted offset-length pairs whenever + * possible. + * + * Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. Function calls are too expensive. + */ +static +void heap_merge(const PNCIO_Access *others_req, + const MPI_Count *count, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off, + MPI_Count *srt_len, +#else + MPI_Offset *srt_off, + int *srt_len, +#endif + const MPI_Count *start_pos, + int nprocs, + int nprocs_recv, + MPI_Count *total_elements) +{ + typedef struct { + MPI_Offset *off_list; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_list; +#else + int *len_list; +#endif + MPI_Count nelem; + } heap_struct; + + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + + a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + a[j].off_list = others_req[i].offsets + start_pos[i]; + a[j].len_list = others_req[i].lens + start_pos[i]; + a[j].nelem = count[i]; + j++; + } + } + +#define SWAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } + + heapsize = nprocs_recv; + + /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. 
+ */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < *total_elements; i++) { + /* extract smallest element from heap, i.e. the root */ + if (j == 0 || srt_off[j - 1] + srt_len[j - 1] < *(a[0].off_list)) { + srt_off[j] = *(a[0].off_list); + srt_len[j] = *(a[0].len_list); + j++; + } else { + /* this offset-length pair can be coalesced into the previous one */ + srt_len[j - 1] = *(a[0].off_list) + *(a[0].len_list) - srt_off[j - 1]; + } + (a[0].nelem)--; + + if (a[0].nelem) { + (a[0].off_list)++; + (a[0].len_list)++; + } else { + a[0] = a[heapsize - 1]; + heapsize--; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + NCI_Free(a); + *total_elements = j; +} + +#define CACHE_REQ(list, nelems, buf) { \ + MPI_Aint buf_addr; \ + list.len[list.count] = nelems; \ + MPI_Get_address(buf, &buf_addr); \ + list.disp[list.count] = buf_addr; \ + list.count++; \ +} + +static +int Exchange_data_recv( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file */ + char **recv_buf, /* OUT: [nbufs] internal buffer used to + * receive from other processes */ + const PNCIO_View *buf_view, /* IN: flattened buffer + * offset-length pairs */ + const MPI_Count *recv_size, /* [nprocs] recv_size[i] is amount of + * this aggregator recv from rank i */ + MPI_Offset range_off, /* starting file offset of this + * aggregator's write region */ + MPI_Count range_size, /* amount of this aggregator's write + * region */ + const MPI_Count *recv_count, /* [nprocs] recv_count[i] is the number + * of offset-length pairs received from + * rank i */ + const MPI_Count *start_pos, /* [nprocs] start_pos[i] starting value + * of others_req[i].curr */ + const PNCIO_Access *others_req, /* [nprocs] others_req[i] is rank i's + * write requests fall into this + * aggregator's file domain */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * offsets for sending this rank's + * write data to aggregator i */ + off_len_list *srt_off_len, /* OUT: list of write offset-length + * pairs of this aggregator */ + disp_len_list *recv_list) /* OUT: displacement-length pairs of + * recv buffer */ +{ + char *buf_ptr, *contig_buf; + size_t alloc_sz; + int i, j, nprocs, myrank, nprocs_recv, hole, build_srt_off_len; + MPI_Count sum_recv; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* srt_off_len contains the file offset-length pairs to be written by this + * aggregator at this round. The file region starts from range_off with + * size of range_size. 
+ */ + + srt_off_len->num = 0; + srt_off_len->off = NULL; + sum_recv = 0; + nprocs_recv = 0; + + /* calculate receive metadata */ + j = -1; + for (i = 0; i < nprocs; i++) { + srt_off_len->num += recv_count[i]; + if (j == -1 && recv_count[i] > 0) j = i; + sum_recv += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + } + + if (nprocs_recv == 0) return NC_NOERR; + +// MPI_Count numx = srt_off_len->num; printf("nprocs_recv=%d PNCIO_DS_WR_NAGGRS_LB=%d srt_off_len->num=%lld PNCIO_DS_WR_NPAIRS_LB=%d\n",nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,srt_off_len->num,PNCIO_DS_WR_NPAIRS_LB); + + /* determine whether checking holes is necessary */ + if (srt_off_len->num == 0) { + /* this process has nothing to receive and hence no hole */ + build_srt_off_len = 0; + hole = 0; + } else if (srt_off_len->num == 1) { + build_srt_off_len = 0; + hole = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + srt_off_len->off[0] = others_req[j].offsets[start_pos[j]]; + srt_off_len->len[0] = others_req[j].lens[start_pos[j]]; + } else if (fd->hints->ds_write == PNCIO_HINT_ENABLE) { + /* skip building of srt_off_len and proceed to read-modify-write */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } else if (fd->hints->ds_write == PNCIO_HINT_AUTO) { + if (DO_HEAP_MERGE(nprocs_recv, srt_off_len->num)) { + /* When the number of sorted offset-length lists or the total + * number of offset-length pairs are too large, the heap-merge sort + * below for building srt_off_len can become very expensive. Such + * sorting is also used to check holes to determine whether + * read-modify-write is necessary. + */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } + else /* heap-merge is less expensive, proceed to build srt_off_len */ + build_srt_off_len = 1; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (build_srt_off_len) { + fd->write_counter[1]++; + fd->write_counter[2] = MAX(fd->write_counter[2], srt_off_len->num); + fd->write_counter[3] = MAX(fd->write_counter[3], nprocs_recv); + } else { + fd->write_counter[4]++; + fd->write_counter[5] = MAX(fd->write_counter[5], srt_off_len->num); + fd->write_counter[6] = MAX(fd->write_counter[6], nprocs_recv); + } +#endif + } else { /* if (fd->hints->ds_write == PNCIO_HINT_DISABLE) */ + /* User explicitly disable data sieving to skip read-modify-write. + * Whether or not there is a hole is not important. However, + * srt_off_len must be constructed to merge all others_req[] into a + * single sorted list. This step is necessary because after this + * subroutine returns, write data from all non-aggregators will be + * packed into the write_buf, with a possibility of overlaps, and + * as srt_off_len stores the coalesced offset-length pairs of + * individual non-contiguous write requests, it is used to write them + * to the file. + */ + build_srt_off_len = 1; + } + + if (build_srt_off_len) { + /* merge all the offset-length pairs from others_req[] (already sorted + * individually) into a single list of offset-length pairs. 
+ */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + srt_off_len->num); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (int*) (srt_off_len->off + srt_off_len->num); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + heap_merge(others_req, recv_count, srt_off_len->off, srt_off_len->len, + start_pos, nprocs, nprocs_recv, &srt_off_len->num); + + /* Now, (srt_off_len->off and srt_off_len->len) are in an increasing + * order of file offsets. In addition, they are coalesced. + */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[5] += MPI_Wtime() - curT; +#endif + /* whether or not there are holes */ + hole = (srt_off_len->num > 1); + } + +// printf("%s at %d: ds_write=%s build_srt_off_len=%d hole=%d skip_read=%d srt_off_len->num=%lld\n",__func__,__LINE__, (fd->hints->ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,fd->skip_read,srt_off_len->num); +// printf("%s at %d: ds_write=%s build_srt_off_len=%d hole=%d nprocs_recv=%d(PNCIO_DS_WR_NAGGRS_LB=%d) numx=%lld(PNCIO_DS_WR_NPAIRS_LB=%d)\n",__func__,__LINE__, (fd->hints->ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,numx,PNCIO_DS_WR_NPAIRS_LB); + + /* data sieving */ + if (fd->hints->ds_write != PNCIO_HINT_DISABLE && hole) { + if (fd->skip_read) + memset(write_buf, 0, range_size); + else { + MPI_Offset r_len; + r_len = PNCIO_ReadContig(fd, write_buf, range_size, range_off); + if (r_len < 0) return (int)r_len; + } + + /* Once read, holes have been filled and thus the number of + * offset-length pairs, srt_off_len->num, becomes one. + */ + srt_off_len->num = 1; + if (srt_off_len->off == NULL) { /* if has not been malloc-ed yet */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + } + srt_off_len->off[0] = range_off; + srt_off_len->len[0] = range_size; + } + + /* It is possible sum_recv (sum of message sizes to be received) is larger + * than the size of collective buffer, write_buf, if writes from multiple + * remote processes overlap. Receiving messages into overlapped regions of + * the same write_buffer may cause a problem. To avoid it, we allocate a + * temporary buffer big enough to receive all messages into disjointed + * regions. Earlier in LUSTRE_Exch_and_write(), write_buf is already + * allocated with twice amount of the file stripe size, with the second + * half to be used to receive messages. If sum_recv is smaller than file + * stripe size, we can reuse that space. But if sum_recv is bigger (an + * overlap case, which is rare), we allocate a separate buffer of size + * sum_recv. 
+ */ + sum_recv -= recv_size[myrank]; + if (sum_recv > fd->hints->striping_unit) + *recv_buf = (char *) NCI_Realloc(*recv_buf, sum_recv); + contig_buf = *recv_buf; + + /* cache displacement-length pairs of receive buffer */ + buf_ptr = contig_buf; + for (i = 0; i < nprocs; i++) { + if (recv_size[i] == 0) + continue; + if (i != myrank) { + if (recv_count[i] > 1) { + CACHE_REQ(recv_list[i], recv_size[i], buf_ptr) + buf_ptr += recv_size[i]; + } else { + /* recv_count[i] is the number of noncontiguous offset-length + * pairs describing the write requests of rank i that fall + * into this aggregator's file domain. When recv_count[i] is 1, + * there is only one such pair, meaning the receive message is + * to be stored contiguously. Such message can be received + * directly into write_buf. + */ + CACHE_REQ(recv_list[i], recv_size[i], + write_buf + others_req[i].mem_ptrs[start_pos[i]]) + } + } else if (buf_view->is_contig && recv_count[i] > 0) { + /* send/recv to/from self uses memcpy(). The case when buftype is + * not contiguous will be handled later in Exchange_data_send(). + */ + char *fromBuf = (char *) buf + buf_idx[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(i, fromBuf, start_pos[i], recv_count[i], write_buf); + } + } + return NC_NOERR; +} + +static +void Exchange_data_send( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file, only matter when send to + * self */ + char **send_buf_ptr, /* OUT: [cb_nodes] point to internal + * send buffer */ + PNCIO_View *buf_view, /* IN/OUT: flattened buffer + * offset-length pairs */ + const MPI_Count *send_size, /* [cb_nodes] send_size[i] is amount of + * this rank sent to aggregator i */ + MPI_Count self_count, /* No. offset-length pairs sent to self + * rank */ + MPI_Count start_pos, /* others_req[myrank].curr */ + const PNCIO_Access *others_req, /* [nprocs] only used when send to self, + * others_req[myrank] */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * for sending this rank's write data + * to aggregator i */ + disp_len_list *send_list) /* OUT: displacement-length pairs of + * send buffer */ +{ + int i, myrank, cb_nodes; + + *send_buf_ptr = NULL; + + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; +// if (myrank==0) printf("%s at %d: cb_nodes=%d\n",__func__,__LINE__, cb_nodes); + if (buf_view->is_contig) { + /* If buftype is contiguous, data can be directly sent from user buf + * at location given by buf_idx. + */ + for (i = 0; i < cb_nodes; i++) { +// if (myrank==0 && send_size[i]) printf("%s at %d: cb_nodes=%d send_size[%d]=%lld my_cb_nodes_index=%d\n",__func__,__LINE__, cb_nodes,i,send_size[i],fd->my_cb_nodes_index); + if (send_size[i] && i != fd->my_cb_nodes_index) + CACHE_REQ(send_list[i], send_size[i], (char*)buf + buf_idx[i]); + } + } else { + char **send_buf, *self_buf; + + /* total send size of this round */ + size_t send_total_size = 0; + for (i = 0; i < cb_nodes; i++) + send_total_size += send_size[i]; + + if (send_total_size == 0) return; + + /* The user buffer to be used to send in this round is not contiguous, + * allocate send_buf[], a contiguous space, copy data to send_buf, + * including ones to be sent to self, and then use send_buf to send. 
+ */ + send_buf = (char **) NCI_Malloc(cb_nodes * sizeof(char *)); + send_buf[0] = (char *) NCI_Malloc(send_total_size); + for (i = 1; i < cb_nodes; i++) + send_buf[i] = send_buf[i - 1] + send_size[i - 1]; + + LUSTRE_Fill_send_buffer(fd, buf, buf_view, send_buf, + send_total_size, send_size, &self_buf, + send_list); + /* Send buffers must not be touched before MPI_Waitall() is completed, + * and thus send_buf will be freed in LUSTRE_Exch_and_write() + */ + + if (fd->my_cb_nodes_index >= 0 && send_size[fd->my_cb_nodes_index] > 0) { + /* contents of user buf that must be sent to self has been copied + * into send_buf[fd->my_cb_nodes_index]. Now unpack it into + * write_buf. + */ + if (self_buf == NULL) self_buf = send_buf[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(myrank, self_buf, start_pos, self_count, write_buf); + } + + *send_buf_ptr = send_buf[0]; + NCI_Free(send_buf); + } +} + +static void LUSTRE_Fill_send_buffer(PNCIO_File *fd, + const void *buf, + PNCIO_View *buf_view, /* IN/OUT */ + char **send_buf, + size_t send_total_size, + const MPI_Count *send_size, + char **self_buf, + disp_len_list *send_list) +{ + /* this function is only called if buftype is not contiguous */ + int q, first_q=-1, isUserBuf=0; + MPI_Count send_size_rem=0, size, copy_size=0; + char *user_buf_ptr=NULL, *send_buf_ptr=NULL, *same_buf_ptr=NULL; + MPI_Offset off, user_buf_idx; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset len, rem_len; +#else + int len, rem_len; +#endif + +#ifdef WKL_DEBUG +int num_memcpy=0; +#endif + + *self_buf = NULL; + + /* user_buf_idx is to the index offset to buf, indicating the starting + * location to be copied. + * + * buf_view stores the offset-length pairs of the flattened user buffer + * data type. Note this stores offset-length pairs of the data type, + * and write amount can be a multiple of the data type. + * buf_view.count: the number of pairs + * buf_view.off[i]: the ith pair's byte offset to buf. Note the + * flattened offsets of user buffer type may not be sorted in an + * increasing order, unlike fileview which is required by MPI to be + * sorted in a monotonically non-decreasing order. + * buf_view.len[i]: length of the ith pair + * buf_view.idx: index to the offset-length pair currently being + * processed, incremented each round. + * buf_view.rem: amount of data in the pair that has not been copied + * over, changed each round. + */ + user_buf_idx = buf_view->off[buf_view->idx] + + buf_view->len[buf_view->idx] + - buf_view->rem; + /* in case data left to be copied from previous round */ + + /* fd->flat_file.count: the number of noncontiguous file segments this + * rank writes to. Each segment i is described by fd->flat_file.offs[i] + * and fd->flat_file.len[i]. + * fd->flat_file.idx: the index to the fd->flat_file.offs[], + * fd->flat_file.len[] that have been processed in the previous round. 
+ * The while loop below packs write data into send buffers, send_buf[], + * based on this rank's off-len pairs in its file view, + */ + off = fd->flat_file.off[fd->flat_file.idx] + + fd->flat_file.len[fd->flat_file.idx] + - fd->flat_file.rem; + rem_len = fd->flat_file.rem; + +// int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); + while (send_total_size > 0) { + /* this off-len request may span to more than one I/O aggregator */ +// if (rank == 0) printf("rank 0 %s at %d send_total_size=%zd rem_len=%lld\n",__func__,__LINE__,send_total_size,rem_len); + while (rem_len != 0) { + len = rem_len; + q = LUSTRE_Calc_aggregator(fd, off, &len); + /* NOTE: len will be modified by PNCIO_Calc_aggregator() to be no + * more than a file stripe unit size that aggregator "q" is + * responsible for. Note q is not the MPI rank ID, It is the array + * index to fd->hints->ranklist[]. + * + * Now len is the amount of data in ith off-len pair that should be + * sent to aggregator q. Note q can also be self. In this case, + * data is also packed into send_buf[q] or pointed to a segment of + * buf when the data to be packed is contiguous. send_buf[q] will + * later be copied to write buffer in MEMCPY_UNPACK, instead of + * calling MPI_Issend to send. + * + * send_size[q]: data amount of this rank needs to send to + * aggregator q in this round. + * + * len and send_size[q] are all always <= striping_unit + */ + +// if (rank == 0) printf("rank 0 %s at %d rem_len=%lld len=%lld first_q=%d q=%d idx=%lld\n",__func__,__LINE__,rem_len,len,first_q,q,buf_view->idx); + + if (first_q != q) { + assert(send_size_rem == 0); + first_q = q; + isUserBuf = 1; + send_size_rem = send_size[q]; + copy_size = 0; + same_buf_ptr = (char*)buf + user_buf_idx; /* no increment */ + user_buf_ptr = same_buf_ptr; /* increment after each memcpy */ + if (send_buf != NULL) + send_buf_ptr = send_buf[q]; /* increment after each memcpy */ + } + + /* copy len amount of data from buf to send_buf[q] */ + size = len; + + while (size) { + MPI_Count size_in_buf = MIN(size, buf_view->rem); + copy_size += size_in_buf; + user_buf_idx += size_in_buf; + send_size_rem -= size_in_buf; + buf_view->rem -= size_in_buf; +// if (rank == 0) printf("rank 0 %s at %d size=%lld size_in_buf=%lld copy_size=%lld rem=%ld\n",__func__,__LINE__, size, size_in_buf, copy_size,buf_view->rem); + if (buf_view->rem == 0) { /* move on to next off-len pair */ + if (! 
buf_view->is_contig) {
+                        /* user buffer type is not contiguous */
+                        if (send_size_rem) {
+                            /* after this copy send_buf[q] is still not full */
+                            isUserBuf = 0;
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 1 size=%lld user_buf_ptr=%p\n",copy_size,user_buf_ptr);
+                            memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+                            user_buf_ptr += copy_size;
+                            send_buf_ptr += copy_size;
+                            copy_size = 0;
+                        } else if (isUserBuf == 0) {
+                            /* send_buf[q] is full and not using user buf,
+                             * copy the remaining delayed data */
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 2 size=%lld\n",copy_size);
+                            memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+                            user_buf_ptr += copy_size;
+                        }
+#ifdef WKL_DEBUG
+                        num_memcpy++;
+#endif
+                    }
+                    /* update buf_view->idx, buf_view->rem, and
+                     * user_buf_idx
+                     */
+                    buf_view->idx++;
+                    assert(buf_view->idx <= buf_view->count);
+
+                    if (buf_view->idx < buf_view->count) {
+                        user_buf_idx = buf_view->off[buf_view->idx];
+                        buf_view->rem = buf_view->len[buf_view->idx];
+                        user_buf_ptr = (char*) buf + user_buf_idx;
+                    }
+                    else assert(size - size_in_buf == 0);
+
+                }
+                else if (send_size_rem == 0 && isUserBuf == 0) {
+                    /* buf_view->rem > 0, send_buf[q] is full, and not using
+                     * user buf to send, copy the remaining delayed data
+                     */
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 3 size=%lld\n",copy_size);
+                    memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+#ifdef WKL_DEBUG
+                    num_memcpy++;
+#endif
+                    user_buf_ptr += copy_size;
+                }
+                size -= size_in_buf;
+            }
+
+            if (send_size_rem == 0) { /* data to q is fully packed */
+                first_q = -1;
+
+                if (q != fd->my_cb_nodes_index) { /* send only if not self rank */
+                    if (isUserBuf)
+                        CACHE_REQ(send_list[q], send_size[q], same_buf_ptr)
+                    else
+                        CACHE_REQ(send_list[q], send_size[q], send_buf[q])
+                }
+                else if (isUserBuf) {
+                    /* The send buffer is also (part of) the user's buf.
+                     * Return the buffer pointer, so the self send data can
+                     * be directly unpacked from the user buf to the write
+                     * buffer.
+                     */
+                    *self_buf = same_buf_ptr;
+                }
+            }
+            /* len is the amount of data copied */
+            off += len;
+            rem_len -= len;
+            fd->flat_file.rem -= len;
+            send_total_size -= len;
+            if (send_total_size == 0) break;
+        }
+
+        /* all requested data has been packed; stop before advancing past
+         * the last offset-length pair of the file view
+         */
+        if (send_total_size == 0) break;
+
+        /* done with this off-len pair, move on to the next */
+        if (fd->flat_file.rem == 0) {
+            fd->flat_file.idx++;
+            fd->flat_file.rem = fd->flat_file.len[fd->flat_file.idx];
+        }
+        off = fd->flat_file.off[fd->flat_file.idx];
+        rem_len = fd->flat_file.rem;
+    }
+
+#ifdef WKL_DEBUG
+    if (num_memcpy > 0) printf("---- fd->flat_file.count=%lld fd->flat_file.idx=%lld buf_view->count=%lld num_memcpy=%d\n",fd->flat_file.count,fd->flat_file.idx,buf_view->count,num_memcpy);
+#endif
+}
+
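+/* The packing loop in LUSTRE_Fill_send_buffer() above interleaves two
+ * flattened views: the file view (fd->flat_file) and the user buffer view
+ * (buf_view). A minimal standalone sketch of the same two-pointer walk is
+ * shown below. The types and names are hypothetical, not part of this
+ * driver; it assumes the driver's MIN macro. It packs nbytes from a
+ * noncontiguous user buffer, described by offset-length pairs bv[], into a
+ * contiguous staging buffer dst, the way send_buf[q] is filled above.
+ */
+#if 0 /* illustration only, never compiled */
+typedef struct { MPI_Offset off, len; } flat_pair;
+
+static void pack_flattened(const flat_pair *bv, MPI_Count npairs,
+                           const char *src, char *dst, MPI_Offset nbytes)
+{
+    MPI_Offset j = 0, rem = bv[0].len, done = 0;
+    while (done < nbytes && j < npairs) {
+        MPI_Offset sz = MIN(rem, nbytes - done);
+        /* copy from the unconsumed region of the current pair */
+        memcpy(dst + done, src + bv[j].off + bv[j].len - rem, sz);
+        done += sz;
+        rem  -= sz;
+        if (rem == 0 && ++j < npairs) /* advance to the next pair */
+            rem = bv[j].len;
+    }
+}
+#endif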
diff --git a/src/drivers/pncio/pncio_lustre_wrstr.c b/src/drivers/pncio/pncio_lustre_wrstr.c
new file mode 100644
index 000000000..341fab400
--- /dev/null
+++ b/src/drivers/pncio/pncio_lustre_wrstr.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <pncio.h>
+
+#define BUFFERED_WRITE {                                                      \
+    if (req_off >= writebuf_off + writebuf_len) {                            \
+        if (writebuf_len) {                                                  \
+            w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,            \
+                                      writebuf_off);                         \
+            if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+                PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+            if (w_len < 0) {                                                 \
+                NCI_Free(writebuf);                                          \
+                return w_len;                                                \
+            }                                                                \
+            total_w_len += w_len;                                            \
+        }                                                                    \
+        writebuf_off = req_off;                                              \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+        r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);  \
+        if (r_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return r_len;                                                    \
+        }                                                                    \
+    }                                                                        \
+    write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);          \
+    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off,     \
+           write_sz);                                                        \
+    while (write_sz != req_len) {                                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);          \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        req_len -= write_sz;                                                 \
+        userbuf_off += write_sz;                                             \
+        writebuf_off += writebuf_len;                                        \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+        r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);  \
+        if (r_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return r_len;                                                    \
+        }                                                                    \
+        write_sz = MIN(req_len, writebuf_len);                               \
+        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);               \
+    }                                                                        \
+}
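+
+/* BUFFERED_WRITE above implements the write side of data sieving: it keeps
+ * a stripe-aligned window of the file in writebuf and, before modifying it,
+ * fills the window from the file so that bytes not covered by the request
+ * are preserved. A minimal sketch of one read-modify-write cycle is shown
+ * below; it reuses the variable names of the macro but is an illustration
+ * only, not part of this driver.
+ */
+#if 0 /* illustration only, never compiled */
+    /* align the buffered window to the next stripe boundary */
+    writebuf_off = req_off;
+    writebuf_len = MIN(end_offset - writebuf_off + 1,
+                       (writebuf_off / stripe_size + 1) * stripe_size
+                       - writebuf_off);
+    /* read:   fetch the current file contents of the window */
+    r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);
+    /* modify: overwrite only the requested byte range */
+    memcpy(writebuf + (req_off - writebuf_off), (char *)buf + userbuf_off,
+           write_sz);
+    /* write:  flush the whole window back in one contiguous write */
+    w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+#endif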
+
+/* This macro is used when the filetype is contiguous and the buftype is
+ * not. It does not do a read-modify-write and does not lock.
+ */
+#define BUFFERED_WRITE_WITHOUT_READ {                                        \
+    if (req_off >= writebuf_off + writebuf_len) {                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        writebuf_off = req_off;                                              \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+    }                                                                        \
+    write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);          \
+    memcpy(writebuf + req_off - writebuf_off,                                \
+           (char *)buf + userbuf_off, write_sz);                             \
+    while (write_sz != req_len) {                                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        req_len -= write_sz;                                                 \
+        userbuf_off += write_sz;                                             \
+        writebuf_off += writebuf_len;                                        \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        write_sz = MIN(req_len, writebuf_len);                               \
+        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);               \
+    }                                                                        \
+}
+
+MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd,
+                                     const void *buf,
+                                     PNCIO_View buf_view,
+                                     MPI_Offset offset)
+{
+    char *writebuf;
+    int i, j, k, st_index=0, stripe_size;
+    /* offset is in units of etype relative to the filetype. */
+    MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0, off, disp;
+    MPI_Offset userbuf_off, req_off, end_offset=0, writebuf_off, start_off;
+    MPI_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size;
+    MPI_Offset req_len, r_len, w_len, total_w_len=0;
+    MPI_Count bufsize, writebuf_len, write_sz;
+
+    /* The case of both buftype and filetype being contiguous has gone to
+     * PNCIO_WriteContig().
+     */
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    if (fd->hints->ds_write == PNCIO_HINT_DISABLE) {
+        /* if the user has disabled data sieving on writes, use the naive
+         * approach instead.
+         */
+        return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset);
+    }
+
+    /* PnetCDF always sets these 3 conditions */
+    assert(fd->filetype == MPI_BYTE);
+    assert(fd->flat_file.size == buf_view.size);
+    if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+    bufsize = buf_view.size;
+
+    /* get striping info */
+    stripe_size = fd->hints->striping_unit;
+
+    if (!buf_view.is_contig && fd->flat_file.is_contig) {
+        /* noncontiguous in write buffer, contiguous in file.
*/ + + off = fd->disp + offset; + if (fd->flat_file.count > 0) off += fd->flat_file.off[0]; + + start_off = off; + end_offset = start_off + bufsize - 1; + + /* write stripe size buffer each time */ + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed + */ + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + BUFFERED_WRITE_WITHOUT_READ; + off += buf_view.len[i]; + } + + /* write the buffer out the last round */ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, bufsize); + + NCI_Free(writebuf); + + if (w_len < 0) return w_len; + total_w_len += w_len; + + } else { /* contiguous buffer and non-contiguous in file */ + disp = fd->disp; +/* for non-contiguous in file, PnetCDF always uses disp == 0 */ +assert(disp == 0); + + /* find the starting index in fd->flat_file offset-length pairs */ + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > offset) { + st_index = i; + fwr_size = sum - offset; + abs_off_in_filetype = fd->flat_file.off[i] + + offset - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; + + /* Write request is within single flat_file contig block. This could + * happen, for example, with subarray types that are actually fairly + * contiguous. + */ + if (buf_view.is_contig && bufsize <= fwr_size) { + req_off = start_off; + req_len = bufsize; + end_offset = start_off + bufsize - 1; + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + memset(writebuf, -1, (size_t)MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + userbuf_off = 0; + BUFFERED_WRITE_WITHOUT_READ; + + /* write the buffer out the last round */ + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (w_len > 0) total_w_len += w_len; + + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + NCI_Free(writebuf); + + return total_w_len; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. 
+ * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ + + st_fwr_size = fwr_size; + j = st_index; + i_offset = fwr_size = MIN(st_fwr_size, bufsize); + end_offset = offset + fwr_size - 1; + while (i_offset < bufsize) { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + i_offset += fwr_size; + end_offset = off + fwr_size - 1; + } + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + writebuf_off = 0; + writebuf_len = 0; + writebuf = (char *) NCI_Malloc(stripe_size); + memset(writebuf, -1, stripe_size); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. + */ + i_offset = 0; + j = st_index; + off = offset; + fwr_size = MIN(st_fwr_size, bufsize); + while (i_offset < bufsize) { + if (fwr_size) { + req_off = off; + req_len = fwr_size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + i_offset += fwr_size; + if (i_offset >= bufsize) break; + + if (off + fwr_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += fwr_size; + /* no more I/O needed. off is incremented by fwr_size. */ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + fwr_size = st_fwr_size; + bwr_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(fwr_size, bwr_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + num += size; + if (num >= bufsize) break; + + new_fwr_size = fwr_size; + new_bwr_size = bwr_size; + + if (size == fwr_size) { + /* reached end of contiguous block in file */ + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + + new_fwr_size = fd->flat_file.len[j]; + if (size != bwr_size) { + i_offset += size; + new_bwr_size -= size; + } + } + + if (size == bwr_size) { + /* reached end of contiguous block in memory */ + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_bwr_size = buf_view.len[k]; + if (size != fwr_size) { + off += size; + new_fwr_size -= size; + } + } + fwr_size = new_fwr_size; + bwr_size = new_bwr_size; + } + } + + /* write the buffer out the last round */ + if (writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + if (w_len < 0) return w_len; + total_w_len += w_len; + } + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(writebuf); + } + + return buf_view.size; +} diff --git a/src/drivers/pncio/pncio_open.c b/src/drivers/pncio/pncio_open.c new file mode 100644 index 000000000..38981b2c8 --- /dev/null +++ b/src/drivers/pncio/pncio_open.c @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>     /* fprintf() */
+#include <string.h>    /* strerror() */
+#include <fcntl.h>     /* open(), O_CREAT */
+#include <sys/types.h> /* open(), umask() */
+#include <sys/stat.h>  /* umask() */
+
+#include <errno.h>     /* errno */
+#include <assert.h>    /* assert() */
+
+#include <mpi.h>
+
+#include "pncio.h"
+
+/*----< GEN_set_cb_node_list() >---------------------------------------------*/
+/* Construct the list of I/O aggregators. It sets the following:
+ * fd->hints->ranklist[]
+ * fd->hints->cb_nodes, and the file info for hint cb_nodes
+ * fd->is_agg: indicating whether this rank is an I/O aggregator
+ * fd->my_cb_nodes_index: index into fd->hints->ranklist[], -1 if N/A
+ */
+static
+int GEN_set_cb_node_list(PNCIO_File *fd)
+{
+    int i, j, k, nprocs, rank, *nprocs_per_node, **ranks_per_node;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (fd->hints->cb_nodes == 0)
+        /* If hint cb_nodes is not set by the user, select one rank per node
+         * to be an I/O aggregator
+         */
+        fd->hints->cb_nodes = fd->num_nodes;
+    else if (fd->hints->cb_nodes > nprocs)
+        /* cb_nodes must be <= nprocs */
+        fd->hints->cb_nodes = nprocs;
+
+    fd->hints->ranklist = (int *) NCI_Malloc(sizeof(int) * fd->hints->cb_nodes);
+    if (fd->hints->ranklist == NULL)
+        return NC_ENOMEM;
+
+    /* number of MPI processes running on each node */
+    nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int));
+
+    for (i=0; i<nprocs; i++)
+        nprocs_per_node[fd->node_ids[i]]++;
+
+    /* construct rank IDs of MPI processes running on each node */
+    ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes);
+    ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs);
+    for (i=1; i<fd->num_nodes; i++)
+        ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1];
+
+    for (i=0; i<fd->num_nodes; i++) nprocs_per_node[i] = 0;
+
+    /* Populate ranks_per_node[], the list of MPI ranks running on each node.
+     * Populate nprocs_per_node[], the number of MPI processes on each node.
+     */
+    for (i=0; i<nprocs; i++) {
+        k = fd->node_ids[i];
+        ranks_per_node[k][nprocs_per_node[k]] = i;
+        nprocs_per_node[k]++;
+    }
+
+    /* select process ranks from nodes in a round-robin fashion to be I/O
+     * aggregators
+     */
+    k = j = 0;
+    for (i=0; i<fd->hints->cb_nodes; i++) {
+        if (j >= nprocs_per_node[k]) { /* if run out of ranks in this node k */
+            k++;
+            if (k == fd->num_nodes) { /* round-robin back to the first node */
+                k = 0;
+                j++;
+            }
+        }
+        /* select the jth rank of node k as an I/O aggregator */
+        fd->hints->ranklist[i] = ranks_per_node[k++][j];
+        if (rank == fd->hints->ranklist[i]) {
+            fd->is_agg = 1;
+            fd->my_cb_nodes_index = i;
+        }
+        if (k == fd->num_nodes) { /* round-robin back to the first node */
+            k = 0;
+            j++;
+        }
+    }
+    NCI_Free(ranks_per_node[0]);
+    NCI_Free(ranks_per_node);
+    NCI_Free(nprocs_per_node);
+
+    return 0;
+}
+
+/*----< GEN_create() >-------------------------------------------------------*/
+/* 1. root creates the file
+ * 2. root sets and obtains striping info
+ * 3. root broadcasts striping info
+ * 4. non-root processes receive striping info from root
+ * 5. non-root processes open the file
+ */
+static int
+GEN_create(PNCIO_File *fd,
+           int mpi_io_mode)
+{
+    int err=NC_NOERR, rank, amode, perm, old_mask;
+    int stripin_info[4] = {-1, -1, -1, -1};
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+if (rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);}
+#endif
+
+    amode = O_CREAT;
+    if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR;
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    /* The root process creates the file first; all other processes then
+     * open it.
+     */
+    if (rank > 0) goto err_out;
+
+    fd->fd_sys = open(fd->filename, amode, perm);
+    if (fd->fd_sys == -1) {
+        fprintf(stderr,"%s line %d: rank %d fails to create file %s (%s)\n",
+                __func__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+
+err_out:
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+
+    if (rank > 0) { /* non-root processes */
+        fd->fd_sys = open(fd->filename, O_RDWR, perm);
+        if (fd->fd_sys == -1) {
+            fprintf(stderr,"%s line %d: rank %d fails to open file %s (%s)\n",
+                    __func__,__LINE__, rank, fd->filename, strerror(errno));
+            return ncmpii_error_posix2nc("open");
+        }
+    }
+
+    /* construct the cb_nodes rank list */
+    GEN_set_cb_node_list(fd);
+    MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:");
+
+    return err;
+}
+
+/*----< GEN_open() >---------------------------------------------------------*/
+/* 1. all processes open the file.
+ * 2. root obtains striping info and broadcasts it to all others
+ */
+static int
+GEN_open(PNCIO_File *fd)
+{
+    int err=NC_NOERR, rank, perm, old_mask, omode;
+    int stripin_info[4] = {1048576, -1, -1, -1};
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+if (rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);}
+#endif
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    if (fIsSet(fd->access_mode, MPI_MODE_RDWR))
+        omode = O_RDWR;
+    else
+        omode = O_RDONLY;
+
+    /* All processes open the file. */
+    fd->fd_sys = open(fd->filename, omode, perm);
+    if (fd->fd_sys == -1) {
+        fprintf(stderr, "%s line %d: rank %d fails to open file %s (%s)\n",
+                __func__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+
+    /* Only root obtains the striping information and broadcasts it to all
+     * other processes.
+     */
+    if (rank == 0) {
+        /* Use the underlying file system block size as the file
+         * striping_unit */
+        struct stat statbuf;
+        if (fstat(fd->fd_sys, &statbuf) == 0)
+            /* file system block size is usually < MAX_INT */
+            stripin_info[0] = (int)statbuf.st_blksize;
+    }
+
+err_out:
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+
+    /* construct the cb_nodes rank list */
+    GEN_set_cb_node_list(fd);
+    MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:");
+
+    return err;
+}
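+
+/* Both GEN_create and GEN_open follow the same pattern: only rank 0 queries
+ * the file system for striping parameters, then broadcasts them so that all
+ * ranks agree without flooding the file system with metadata requests. A
+ * minimal sketch of the pattern, reusing the names above (illustration
+ * only, not part of this driver):
+ */
+#if 0 /* illustration only, never compiled */
+    int stripin_info[4] = {-1, -1, -1, -1};
+    if (rank == 0) { /* only root touches the file system */
+        struct stat statbuf;
+        if (fstat(fd->fd_sys, &statbuf) == 0)
+            stripin_info[0] = (int) statbuf.st_blksize;
+    }
+    /* every rank, including root, now picks up the same values */
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    fd->hints->striping_unit = stripin_info[0];
+#endif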
+
+/*----< PNCIO_File_open() >---------------------------------------------------*/
+int PNCIO_File_open(MPI_Comm comm,
+                    const char *filename,
+                    int amode,
+                    MPI_Info info,
+                    PNCIO_File *fd)
+{
+    /* Before reaching this subroutine, PNCIO_FileSysType() should have been
+     * called to check the file system type.
+     */
+    char value[MPI_MAX_INFO_VAL + 1], int_str[16];
+    int i, err, min_err;
+
+    fd->comm        = comm;
+    fd->filename    = filename; /* without file system type name prefix */
+    fd->atomicity   = 0;
+    fd->filetype    = MPI_BYTE;
+    fd->is_open     = 0;
+    fd->access_mode = amode;
+    fd->io_buf      = NULL; /* collective buffer used by aggregators only */
+
+    fd->flat_file.count     = 0; /* flattened fileview in offset-length pairs */
+    fd->flat_file.size      = -1;
+    fd->flat_file.is_contig = 1;
+    fd->flat_file.off       = NULL;
+    fd->flat_file.len       = NULL;
+
+    /* create and initialize the info object */
+    fd->hints = (PNCIO_Hints*) NCI_Calloc(1, sizeof(PNCIO_Hints));
+    if (info == MPI_INFO_NULL)
+        MPI_Info_create(&fd->info);
+    else
+        MPI_Info_dup(info, &fd->info);
+
+    err = PNCIO_File_SetInfo(fd, fd->info);
+    if (err != NC_NOERR)
+        return err;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    for (i=0; iwrite_timing[i] = fd->read_timing[i] = 0;
+        fd->write_counter[i] = fd->read_counter[i] = 0;
+    }
+#endif
+
+    assert(fd->file_system != PNCIO_FSTYPE_MPIIO);
+
+    if (fd->file_system == PNCIO_LUSTRE) {
+        if (amode & MPI_MODE_CREATE)
+            err = PNCIO_Lustre_create(fd, amode);
+        else
+            err = PNCIO_Lustre_open(fd);
+    }
+    else {
+        if (amode & MPI_MODE_CREATE)
+            err = GEN_create(fd, amode);
+        else
+            err = GEN_open(fd);
+    }
+    if (err != NC_NOERR) goto err_out;
+
+    /* TODO: when hint no_indep_rw is set to true, only aggregators open the
+     * file */
+    fd->is_open = 1;
+
+    /* set file striping hints */
+    snprintf(int_str, 16, "%d", fd->hints->striping_unit);
+    MPI_Info_set(fd->info, "striping_unit", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->striping_factor);
+    MPI_Info_set(fd->info, "striping_factor", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->start_iodevice);
+    MPI_Info_set(fd->info, "start_iodevice", int_str);
+
+    /* set the number of I/O aggregators */
+    snprintf(int_str, 16, "%d", fd->hints->cb_nodes);
+    MPI_Info_set(fd->info, "cb_nodes", int_str);
+
+    /* add hint "cb_node_list", the list of aggregators' rank IDs */
+    snprintf(value, 16, "%d", fd->hints->ranklist[0]);
+    for (i=1; i<fd->hints->cb_nodes; i++) {
+        snprintf(int_str, 16, " %d", fd->hints->ranklist[i]);
+        if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) {
+            strcat(value, " ...");
+            break;
+        }
+        strcat(value, int_str);
+    }
+    MPI_Info_set(fd->info, "cb_node_list", value);
+
+    /* the collective buffer size must be at least the file striping size */
+    if (fd->hints->cb_buffer_size < fd->hints->striping_unit) {
+        fd->hints->cb_buffer_size = fd->hints->striping_unit;
+        snprintf(int_str, 16, "%d", fd->hints->cb_buffer_size);
+        MPI_Info_set(fd->info, "cb_buffer_size", int_str);
+    }
+
+    /* the collective buffer is used only by I/O aggregators */
+    if (fd->is_agg) {
+        fd->io_buf = NCI_Calloc(1, fd->hints->cb_buffer_size);
+        if (fd->io_buf == NULL)
+            return NC_ENOMEM;
+    }
+
+err_out:
+    MPI_Allreduce(&err, &min_err, 1, MPI_INT, MPI_MIN, comm);
+    /* All NC errors are < 0 */
+    if (min_err < 0) {
+        if (err == 0) /* close the file if it was opened successfully */
+            close(fd->fd_sys);
+        NCI_Free(fd->hints);
+        if (fd->info != MPI_INFO_NULL)
+            MPI_Info_free(&(fd->info));
+        if (fd->io_buf != NULL)
+            NCI_Free(fd->io_buf);
+        /* all ranks must fail together, as the resources above have been
+         * released */
+        return min_err;
+    }
+    return err;
+}
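+
+/* A hypothetical caller sketch (not part of this patch), showing the
+ * open/create entry point with the MPI-IO style access-mode flags assumed
+ * above:
+ */
+#if 0 /* illustration only, never compiled */
+    PNCIO_File fd;
+    int err = PNCIO_File_open(MPI_COMM_WORLD, "testfile.nc",
+                              MPI_MODE_CREATE | MPI_MODE_RDWR,
+                              MPI_INFO_NULL, &fd);
+    if (err != NC_NOERR)
+        /* all ranks receive the same error, as failures are synchronized
+         * by the MPI_Allreduce above */
+        return err;
+#endif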
diff --git a/src/drivers/pncio/pncio_read.c b/src/drivers/pncio/pncio_read.c
new file mode 100644
index 000000000..a7594ed2f
--- /dev/null
+++ b/src/drivers/pncio/pncio_read.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h> /* pread() */
+
+#include <mpi.h>
+
+#include "pncio.h"
+
+/*----< PNCIO_ReadContig() >--------------------------------------------------*/
+MPI_Offset PNCIO_ReadContig(PNCIO_File *fd,
+                            void *buf,
+                            MPI_Offset r_size,
+                            MPI_Offset offset)
+{
+    ssize_t err = 0;
+    size_t r_count;
+    MPI_Offset bytes_xfered = 0;
+    char *p;
+
+// printf("%s at %d: %s pread offset=%lld r_size=%lld\n",__func__,__LINE__,fd->filename,offset,r_size);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double timing = MPI_Wtime();
+#endif
+    p = (char *) buf;
+    while (bytes_xfered < r_size) {
+        r_count = r_size - bytes_xfered;
+        err = pread(fd->fd_sys, p, r_count, offset + bytes_xfered);
+        if (err == -1)
+            goto ioerr;
+        if (err == 0)
+            break;
+        bytes_xfered += err;
+        p += err;
+    }
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    fd->read_timing[2] += MPI_Wtime() - timing;
+#endif
+
+ioerr:
+    if (err == -1)
+        bytes_xfered = ncmpii_error_posix2nc("pread");
+
+/*
+if (offset > 0) {unsigned long long wkl[4];
+    memcpy(wkl, buf, sizeof(unsigned long long) * 4);
+    ncmpii_in_swapn(wkl, 4, 8);
+    printf("%s at %d: %s pread offset=%lld r_size=%lld wkl=%llu %lld %lld %lld\n",__func__,__LINE__,fd->filename,offset,r_size,wkl[0],wkl[1],wkl[2],wkl[3]);
+}
+*/
+
+    return bytes_xfered;
+}
+
+/*----< file_read() >--------------------------------------------------------*/
+/* This is an independent call. */
+static
+MPI_Offset file_read(PNCIO_File *fd,
+                     MPI_Offset offset, /* relative to fileview */
+                     void *buf,
+                     PNCIO_View buf_view)
+{
+    MPI_Offset r_len=0;
+
+// printf("%s at %d: offset=%lld buf_view size=%lld\n",__func__,__LINE__, offset,buf_view.size);
+
+    assert(fd->filetype == MPI_BYTE);
+    if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+    if (buf_view.size == 0) /* zero-sized request */
+        return NC_NOERR;
+
+    if (buf_view.is_contig && fd->flat_file.is_contig) {
+        if (fd->flat_file.count > 0) offset += fd->flat_file.off[0];
+        r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset);
+    }
+    else
+        r_len = PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset);
+
+    return r_len;
+}
+
+/*----< PNCIO_File_read_at() >------------------------------------------------*/
+/* This is an independent call.
+ * offset is a position in the file relative to the current view, expressed
+ * as a count of etypes.
+ */
+MPI_Offset PNCIO_File_read_at(PNCIO_File *fh,
+                              MPI_Offset offset,
+                              void *buf,
+                              PNCIO_View buf_view)
+{
+    assert(fh != NULL);
+
+    if (buf_view.size == 0) return NC_NOERR;
+
+    if (buf_view.size < 0) return NC_ENEGATIVECNT;
+
+    /* PnetCDF has only 2 modes: read-only and read-write */
+    // if (fh->access_mode & MPI_MODE_RDONLY) return NC_EPERM;
+
+    return file_read(fh, offset, buf, buf_view);
+}
+
+/*----< PNCIO_File_read_at_all() >--------------------------------------------*/
+/* This is a collective call.
+ * offset is a position in the file relative to the current view, expressed
+ * as a count of etypes.
+ */
+MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh,
+                                  MPI_Offset offset,
+                                  void *buf,
+                                  PNCIO_View buf_view)
+{
+    int err=NC_NOERR;
+    MPI_Offset r_len;
+
+    assert(fh != NULL);
+
+    if (buf_view.size < 0) err = NC_ENEGATIVECNT;
+
+    /* PnetCDF has only 2 modes: read-only and read-write */
+    // if (fh->access_mode & MPI_MODE_RDONLY && st == NC_NOERR) st = NC_EPERM;
+
+    /* all ranks must participate in the collective read, even those that
+     * have detected an argument error
+     */
+    r_len = PNCIO_GEN_ReadStridedColl(fh, buf, buf_view, offset);
+
+    return (err == NC_NOERR) ? r_len : err;
+}
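+
+/* A hypothetical usage sketch (not part of this patch): reading nbytes into
+ * a contiguous user buffer through the flattened-view interface. The
+ * PNCIO_View field names follow their uses above; a count of 0 with
+ * is_contig set is assumed to denote a contiguous request of size bytes.
+ */
+#if 0 /* illustration only, never compiled */
+    PNCIO_View v;
+    v.count     = 0;      /* no offset-length pairs: contiguous buffer */
+    v.is_contig = 1;
+    v.size      = nbytes; /* request size in bytes */
+    MPI_Offset got = PNCIO_File_read_at(&fd, offset, buf, v);
+    if (got < 0)
+        return (int)got;  /* negative values are NC error codes */
+#endif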
diff --git a/src/drivers/pncio/pncio_read_coll.c b/src/drivers/pncio/pncio_read_coll.c
new file mode 100644
index 000000000..78af29b48
--- /dev/null
+++ b/src/drivers/pncio/pncio_read_coll.c
@@ -0,0 +1,791 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdbool.h> /* type bool */
+
+#include <pncio.h>
+
+/* prototypes of functions used for collective reads only. */
+static
+MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf,
+                         PNCIO_View buf_view, int nprocs,
+                         int myrank, PNCIO_Access *others_req,
+                         MPI_Offset min_st_offset, MPI_Offset fd_size,
+                         MPI_Offset *fd_start, MPI_Offset *fd_end,
+                         MPI_Aint *buf_idx);
+
+static void R_Exchange_data(PNCIO_File *fd, void *buf,
+                            PNCIO_View buf_view,
+                            MPI_Count *send_size, MPI_Count *recv_size,
+                            MPI_Count *count, MPI_Count *start_pos,
+                            MPI_Count *partial_send,
+                            MPI_Count *recd_from_proc, int nprocs,
+                            int myrank,
+                            MPI_Offset min_st_offset,
+                            MPI_Offset fd_size,
+                            MPI_Offset *fd_start, MPI_Offset *fd_end,
+                            PNCIO_Access *others_req,
+                            int iter, MPI_Aint *buf_idx,
+                            MPI_Aint *actual_recved_bytes);
+
+static void Fill_user_buffer(PNCIO_File *fd, void *buf,
+                             PNCIO_View buf_view, char **recv_buf,
+                             MPI_Count *recv_size,
+                             MPI_Count *recd_from_proc, int nprocs,
+                             MPI_Offset min_st_offset,
+                             MPI_Offset fd_size, MPI_Offset *fd_start,
+                             MPI_Offset *fd_end);
+
+MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd,
+                                     void *buf,
+                                     PNCIO_View buf_view,
+                                     MPI_Offset offset)
+{
+/* Uses a generalized version of the extended two-phase method described in
+ * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core
+ * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming,
+ * (5)4:301--317, Winter 1996.
+ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
+ */
+
+    PNCIO_Access *my_req;
+    /* array of nprocs structures, one for each other process in whose file
+     * domain this process's request lies */
+
+    PNCIO_Access *others_req;
+    /* array of nprocs structures, one for each other process whose request
+     * lies in this process's file domain. */
+
+    int nprocs, nprocs_for_coll, myrank;
+    int interleave_count = 0;
+    MPI_Count *count_my_req_per_proc, count_my_req_procs;
+    MPI_Count *count_others_req_per_proc, count_others_req_procs;
+    MPI_Offset start_offset, end_offset, fd_size, min_st_offset;
+    MPI_Offset *st_offsets = NULL, *fd_start = NULL,
+               *fd_end = NULL, *end_offsets = NULL;
+    MPI_Aint *buf_idx = NULL;
+    MPI_Offset r_len, total_r_len=0;
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+    /* the number of aggregators, cb_nodes, is stored in the hints */
+    nprocs_for_coll = fd->hints->cb_nodes;
+
+    /* only check for interleaving if cb_read isn't disabled */
+    if (fd->hints->cb_read != PNCIO_HINT_DISABLE) {
+        /* For this process's request, calculate the file start and end
Note: end_offset points to the last byte-offset that will + * be accessed, e.g., if start_offset=0 and 100 bytes to be read, + * end_offset=99 + */ + if (fd->flat_file.size == 0) { + start_offset = 0; + end_offset = -1; + } + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } + + /* each process communicates its start and end offsets to other + * processes. The result is an array each of start and end offsets + * stored in order of process rank. */ + st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + end_offsets = st_offsets + nprocs; + + MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET, + fd->comm); + MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET, + fd->comm); + + /* Are the accesses of different processes interleaved? Below is a + * rudimentary check for interleaving, but should suffice for the + * moment. */ + for (int i = 1; i < nprocs; i++) + if ((st_offsets[i] < end_offsets[i - 1]) && + (st_offsets[i] <= end_offsets[i])) + interleave_count++; + } + + if (fd->hints->cb_read == PNCIO_HINT_DISABLE + || (!interleave_count && (fd->hints->cb_read == PNCIO_HINT_AUTO))) { + /* switch to independent read */ + + if (st_offsets != NULL) NCI_Free(st_offsets); + + if (buf_view.size == 0) return 0; + +/* PnetCDF always sets this condition, i.e. when fileview is non-contiguous, offset in this call is always 0. */ +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + return PNCIO_ReadContig(fd, buf, buf_view.size, offset); + } + else + return PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset); + } + + /* We're going to perform aggregation of I/O. Here we call + * PNCIO_Calc_file_domains() to determine what processes will handle I/O + * to what regions. We pass nprocs_for_coll into this function; it is + * used to determine how many processes will perform I/O, which is also + * the number of regions into which the range of bytes must be divided. + * These regions are called "file domains", or FDs. + * + * When this function returns, fd_start, fd_end, fd_size, and + * min_st_offset will be filled in. fd_start holds the starting byte + * location for each file domain. fd_end holds the ending byte location. + * min_st_offset holds the minimum byte location that will be accessed. + * + * Both fd_start[] and fd_end[] are indexed by an aggregator number; this + * needs to be mapped to an actual rank in the communicator later. + * + */ + PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, + fd->hints->striping_unit); + + /* calculate where the portions of the access requests of this process + * are located in terms of the file domains. this could be on the same + * process or on other processes. this function fills in: + * count_my_req_procs - number of processes (including this one) for which + * this process has requests in their file domain + * count_my_req_per_proc - count of requests for each process, indexed + * by rank of the process + * my_req[] - array of data structures describing the requests to be + * performed by each process (including self). indexed by rank. 
+ * buf_idx[] - array of locations into which data can be directly moved; + * this is only valid for contiguous buffer case + */ + PNCIO_Calc_my_req(fd, min_st_offset, fd_start, fd_end, fd_size, nprocs, + &count_my_req_procs, &count_my_req_per_proc, &my_req, + &buf_idx); + + /* perform a collective communication in order to distribute the + * data calculated above. fills in the following: + * count_others_req_procs - number of processes (including this + * one) which have requests in this process's file domain. + * count_others_req_per_proc[] - number of separate contiguous + * requests from proc i lie in this process's file domain. + */ + PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, + my_req, nprocs, myrank, &count_others_req_procs, + &count_others_req_per_proc, &others_req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[1] += MPI_Wtime() - curT; +#endif + + /* read data in sizes of no more than collective buffer size, + * communicate, and fill user buf. + */ + r_len = Read_and_exch(fd, buf, buf_view, nprocs, myrank, others_req, + min_st_offset, fd_size, fd_start, fd_end, buf_idx); + if (r_len > 0) total_r_len += r_len; + + /* free all memory allocated for collective I/O */ + PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx); + PNCIO_Free_others_req(count_others_req_per_proc, others_req); + + NCI_Free(st_offsets); + NCI_Free(fd_start); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[0] += MPI_Wtime() - curT; +#endif + + return (r_len < 0) ? r_len : total_r_len; +} + +static +MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, int nprocs, + int myrank, PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx) +{ +/* Read in sizes of no more than coll_bufsize, an info parameter. + Send data to appropriate processes. + Place recd. data in user buf. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were read all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to read a distributed + array from a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + int i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, real_off; + char *read_buf = NULL, *tmp_buf; + MPI_Count *curr_offlen_ptr, *count, *send_size, *recv_size; + MPI_Count *partial_send, *recd_from_proc, *start_pos; + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset real_size, size, for_curr_iter, for_next_iter; + int rank; + MPI_Aint coll_bufsize; + MPI_Aint actual_recved_bytes = 0; + MPI_Offset r_len; + +/* calculate the number of reads of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. + coll_bufsize is obtained from the hints object. 
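+   As a worked example with hypothetical sizes: if coll_bufsize = 16 MiB
+   and this process's file-domain span (end_loc - st_loc + 1) is 40 MiB,
+   then ntimes = ceil(40 / 16) = 3, and max_ntimes is the MPI_MAX of
+   ntimes over all processes, i.e. the number of communication phases
+   every process must take part in even after its own I/O is done.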
*/ + + coll_bufsize = fd->hints->cb_buffer_size; + + /* grab some initial values for st_loc and end_loc */ + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + /* now find the real values */ + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + + /* calculate ntimes, the number of times this process must perform I/O + * operations in order to complete all the requests it has received. + * the need for multiple I/O operations comes from the restriction that + * we only use coll_bufsize bytes of memory for internal buffering. + */ + if ((st_loc == -1) && (end_loc == -1)) { + /* this process does no I/O. */ + ntimes = 0; + } else { + /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */ + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->read_counter[0] = MAX(fd->read_counter[0], max_ntimes); +#endif + + read_buf = fd->io_buf; /* Allocated at open time */ + + curr_offlen_ptr = NCI_Calloc(nprocs * 7, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. */ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_send = count + nprocs; + /* if only a portion of the last off-len pair is sent to a process + * in a particular iteration, the length sent is stored here. + * calloc initializes to 0. */ + + send_size = partial_send + nprocs; + /* total size of data to be sent to each proc. in an iteration */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recd_from_proc = recv_size + nprocs; + /* amount of data recd. so far from each proc. Used in Fill_user_buffer. + * initialized to 0 here. */ + + start_pos = recd_from_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; + for_curr_iter = for_next_iter = 0; + + MPI_Comm_rank(fd->comm, &rank); + + for (m = 0; m < ntimes; m++) { + /* read buf of size coll_bufsize (or less) */ + /* go through all others_req and check if any are satisfied + * by the current read */ + + /* since MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order, I can maintain a pointer + * (curr_offlen_ptr) to + * current off-len pair for each process in others_req and scan + * further only from there. There is still a problem of filetypes + * such as: (1, 2, 3 are not process nos. They are just numbers for + * three chunks of data, specified by a filetype.) + * + * 1 -------!-- + * 2 -----!---- + * 3 --!----- + * + * where ! indicates where the current read_size limitation cuts + * through the filetype. I resolve this by reading up to !, but + * filling the communication buffer only for 1. I copy the portion + * left over for 2 into a tmp_buf for use in the next + * iteration. i.e., 2 and 3 will be satisfied in the next + * iteration. This simplifies filling in the user's buf at the + * other end, as only one off-len pair with incomplete data + * will be sent. 
I also don't need to send the individual + * offsets and lens along with the data, as the data is being + * sent in a particular order. */ + + /* off = start offset in the file for the data actually read in + * this iteration + * size = size of data read corresponding to off + * real_off = off minus whatever data was retained in memory from + * previous iteration for cases like 2, 3 illustrated above + * real_size = size plus the extra corresponding to real_off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + bool flag = false; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + for (MPI_Count j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; + if (partial_send[i]) { + req_off = others_req[i].offsets[j] + partial_send[i]; + } else { + req_off = others_req[i].offsets[j]; + } + if (req_off < off + size) { + flag = true; + } + } + } + } + if (flag) { + /* This should be only reached by I/O aggregators only */ + r_len = PNCIO_ReadContig(fd, read_buf + for_curr_iter, size, off); + if (r_len < 0) return r_len; + size = r_len; + } + + real_off = off - for_curr_iter; + real_size = size + for_curr_iter; + + for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for_next_iter = 0; + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j = 0; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset req_len; +#else + int req_len; +#endif + if (partial_send[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_send[i]; + req_len = others_req[i].lens[j] - partial_send[i]; + partial_send[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < real_off + real_size) { + count[i]++; + MPI_Aint addr; + MPI_Get_address(read_buf + req_off - real_off, &addr); + others_req[i].mem_ptrs[j] = addr; + send_size[i] += (MIN(real_off + real_size - req_off, req_len)); + + if (real_off + real_size - req_off < req_len) { + partial_send[i] = (real_off + real_size - req_off); + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < real_off + real_size)) { + /* this is the case illustrated in the + * figure above. 
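+                             * A hypothetical instance: if real_off +
+                             * real_size = 100 and others_req[i].offsets[j+1]
+                             * = 80, then 20 bytes of request j+1 have
+                             * already been read and must be retained, so
+                             * for_next_iter is raised to at least 20.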
*/ + for_next_iter = MAX(for_next_iter, + real_off + real_size - + others_req[i].offsets[j + 1]); + /* max because it must cover requests + * from different processes */ + } + break; + } + } else + break; + } + curr_offlen_ptr[i] = j; + } + } + + for_curr_iter = for_next_iter; + + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + + + if (for_next_iter) { + tmp_buf = (char *) NCI_Malloc(for_next_iter); + memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter); + NCI_Free(fd->io_buf); + fd->io_buf = (char *) NCI_Malloc(for_next_iter + coll_bufsize); + memcpy(fd->io_buf, tmp_buf, for_next_iter); + read_buf = fd->io_buf; + NCI_Free(tmp_buf); + } + + off += size; + done += size; + } + + for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to send, but check for recv. */ + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + } + + NCI_Free(curr_offlen_ptr); + + return actual_recved_bytes; +} + +static void R_Exchange_data(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_send, MPI_Count * recd_from_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, int iter, + MPI_Aint * buf_idx, MPI_Aint * actual_recved_bytes) +{ + int i, nprocs_recv, nprocs_send; + char **recv_buf = NULL; + size_t memLen; + MPI_Request *requests; + MPI_Datatype send_type; + MPI_Status *statuses; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + +/* exchange send_size info so that each process knows how much to + receive from whom and how much memory to allocate. */ + + MPI_Alltoall(send_size, 1, MPI_COUNT, recv_size, 1, MPI_COUNT, fd->comm); + + nprocs_recv = 0; + nprocs_send = 0; + memLen = 0; + for (i = 0; i < nprocs; i++) { + memLen += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + if (send_size[i]) + nprocs_send++; + } + + requests = (MPI_Request *) + NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); +/* +1 to avoid a 0-size malloc */ + +/* post recvs. if buf_view.is_contig, data can be directly recd. into + user buf at location given by buf_idx. else use recv_buf. 
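+   For example (hypothetical): if buf_idx[i] = 4096 for aggregator i, the
+   recv_size[i] bytes from rank i land directly at ((char *) buf) + 4096,
+   and buf_idx[i] is then advanced by recv_size[i] for the next iteration.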
*/ + + MPI_Count j = 0; // think of this as a counter of non-zero sends/recs + if (buf_view.is_contig) { + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#else + MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#endif + j++; + buf_idx[i] += recv_size[i]; + } + } + } else { + /* allocate memory for recv_buf and post receives */ + recv_buf = (char **) NCI_Malloc(nprocs * sizeof(char *)); + recv_buf[0] = (char *) NCI_Malloc(memLen); + for (i = 1; i < nprocs; i++) + recv_buf[i] = recv_buf[i - 1] + recv_size[i - 1]; + + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#else + MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#endif + j++; + } + } + } + +/* create derived datatypes and send data */ + + j = 0; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + /* take care if the last off-len pair is a partial send */ + MPI_Offset tmp = 0; + MPI_Count k = 0; + if (partial_send[i]) { + k = start_pos[i] + count[i] - 1; + tmp = others_req[i].lens[k]; + others_req[i].lens[k] = partial_send[i]; + } +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#else + MPI_Type_create_hindexed(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#endif + /* absolute displacement; use MPI_BOTTOM in send */ + MPI_Type_commit(&send_type); + MPI_Isend(MPI_BOTTOM, 1, send_type, i, 0, + fd->comm, requests + nprocs_recv + j); + MPI_Type_free(&send_type); + if (partial_send[i]) + others_req[i].lens[k] = tmp; + j++; + } + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[4] += MPI_Wtime() - curT; +#endif + + + /* +1 to avoid a 0-size malloc */ + statuses = (MPI_Status *) NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); + + /* wait on the receives */ + if (nprocs_recv) { +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif + MPI_Waitall(nprocs_recv, requests, statuses); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + + *actual_recved_bytes = 0; + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count count_recved; + MPI_Get_count_c(&statuses[j], MPI_BYTE, &count_recved); +#else + int count_recved; + MPI_Get_count(&statuses[j], MPI_BYTE, &count_recved); +#endif + *actual_recved_bytes += count_recved; + j++; + } + } + + /* if noncontiguous, to the copies from the recv buffers */ + if (!buf_view.is_contig) + Fill_user_buffer(fd, buf, buf_view, recv_buf, recv_size, + recd_from_proc, nprocs, min_st_offset, + fd_size, fd_start, fd_end); + } + + /* wait on the sends */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nprocs_send, requests + nprocs_recv, MPI_STATUSES_IGNORE); +#else + MPI_Waitall(nprocs_send, requests + nprocs_recv, statuses + nprocs_recv); +#endif +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + 
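+    /* At this point all sends and receives posted in this iteration have
+     * completed; the request/status arrays and, in the noncontiguous case,
+     * the staging receive buffers can now be released. */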
+ NCI_Free(statuses); + NCI_Free(requests); + + if (!buf_view.is_contig) { + NCI_Free(recv_buf[0]); + NCI_Free(recv_buf); + } +} + +#define BUF_INCR { \ + while (buf_incr) { \ + size_in_buf = MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (buf_incr > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ +} + + +#define BUF_COPY { \ + while (size) { \ + size_in_buf = MIN(size, flat_buf_sz); \ + memcpy(((char *) buf) + user_buf_idx, \ + &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ + recv_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (size > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ + BUF_INCR \ +} + +static void Fill_user_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + char **recv_buf, + MPI_Count * recv_size, + MPI_Count * recd_from_proc, int nprocs, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset * fd_start, + MPI_Offset * fd_end) +{ + +/* this function is only called if buftype is not contig */ + + int p, flat_buf_idx; + MPI_Offset flat_buf_sz, size_in_buf, buf_incr, size; + MPI_Offset off, user_buf_idx; + MPI_Offset len, rem_len; + MPI_Count *curr_from_proc, *done_from_proc, *recv_buf_idx; + +/* curr_from_proc[p] = amount of data recd from proc. p that has already + been accounted for so far + done_from_proc[p] = amount of data already recd from proc. p and + filled into user buffer in previous iterations + user_buf_idx = current location in user buffer + recv_buf_idx[p] = current location in recv_buf of proc. p */ + /* combining these three related arrays into a single memory allocation + * (the "times 3" here) can help some highly noncontiguous workloads a bit */ + curr_from_proc = NCI_Malloc(nprocs * 3 * sizeof(*curr_from_proc)); + done_from_proc = curr_from_proc + nprocs; + recv_buf_idx = done_from_proc + nprocs; + + for (int i = 0; i < nprocs; i++) { + recv_buf_idx[i] = curr_from_proc[i] = 0; + done_from_proc[i] = recd_from_proc[i]; + } + + user_buf_idx = buf_view.off[0]; + flat_buf_idx = 0; + flat_buf_sz = buf_view.len[0]; + + /* flat_buf_idx = current index into flattened buftype + * flat_buf_sz = size of current contiguous component in + * flattened buf */ + + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + off = fd->flat_file.off[i]; + rem_len = fd->flat_file.len[i]; + + /* this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. 
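+             * Hypothetical numbers: with fd_size = 1 MiB, a request of
+             * len = 1.5 MiB starting 0.5 MiB into aggregator p's domain
+             * is trimmed here to len = 0.5 MiB; the while loop then
+             * re-enters with off advanced, so the remaining 1 MiB is
+             * attributed to the following file domain(s).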
+ */ + p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end); + + if (recv_buf_idx[p] < recv_size[p]) { + if (curr_from_proc[p] + len > done_from_proc[p]) { + if (done_from_proc[p] > curr_from_proc[p]) { + size = MIN(curr_from_proc[p] + len - done_from_proc[p], + recv_size[p] - recv_buf_idx[p]); + buf_incr = done_from_proc[p] - curr_from_proc[p]; + BUF_INCR + buf_incr = curr_from_proc[p] + len - done_from_proc[p]; + curr_from_proc[p] = done_from_proc[p] + size; + BUF_COPY + } else { + size = MIN(len, recv_size[p] - recv_buf_idx[p]); + buf_incr = len; + curr_from_proc[p] += size; + BUF_COPY + } + } else { + curr_from_proc[p] += len; + buf_incr = len; + BUF_INCR + } + } else { + buf_incr = len; + BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (int i = 0; i < nprocs; i++) + if (recv_size[i]) + recd_from_proc[i] = curr_from_proc[i]; + + NCI_Free(curr_from_proc); +} diff --git a/src/drivers/pncio/pncio_read_str.c b/src/drivers/pncio/pncio_read_str.c new file mode 100644 index 000000000..ae554c2fe --- /dev/null +++ b/src/drivers/pncio/pncio_read_str.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#define BUFFERED_READ { \ + if (req_off >= readbuf_off + readbuf_len) { \ + readbuf_off = req_off; \ + readbuf_len = MIN(max_bufsize, end_offset-readbuf_off+1); \ + r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off); \ + if (r_len < 0) return r_len; \ + total_r_len += r_len; \ + } \ + while (req_len > readbuf_off + readbuf_len - req_off) { \ + partial_read = readbuf_off + readbuf_len - req_off; \ + tmp_buf = (char *) NCI_Malloc(partial_read); \ + memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \ + NCI_Free(readbuf); \ + readbuf = (char *) NCI_Malloc(partial_read + max_bufsize); \ + memcpy(readbuf, tmp_buf, partial_read); \ + NCI_Free(tmp_buf); \ + readbuf_off += readbuf_len-partial_read; \ + readbuf_len = partial_read + \ + MIN(max_bufsize, end_offset-readbuf_off+1); \ + r_len = PNCIO_ReadContig(fd, readbuf+partial_read, \ + readbuf_len-partial_read, \ + readbuf_off+partial_read); \ + if (r_len < 0) return r_len; \ + total_r_len += r_len; \ + } \ + memcpy((char*)buf+userbuf_off, readbuf+req_off-readbuf_off, req_len); \ +} + + +MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd, + void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + char *readbuf, *tmp_buf, *value; + int i, j, k, st_index=0, info_flag; + + MPI_Aint max_bufsize, readbuf_len; + MPI_Offset i_offset, new_brd_size, brd_size, size, abs_off_in_filetype=0; + MPI_Offset new_frd_size, frd_size=0, st_frd_size, userbuf_off, req_len; + MPI_Offset sum, off, req_off, disp, end_offset=0, readbuf_off, start_off; + MPI_Offset r_len, total_r_len=0; + MPI_Count num, bufsize, partial_read; + +// printf("%s at %d:\n",__func__,__LINE__); + + if (fd->hints->ds_read == PNCIO_HINT_DISABLE) { + /* if user has disabled data sieving on reads, use naive + * approach instead. + */ + return PNCIO_GEN_ReadStrided_naive(fd, buf, buf_view, offset); + } + +/* This subroutine is entered with filetype being non-contiguous only */ +assert(fd->filetype == MPI_BYTE); +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + bufsize = buf_view.size; + + /* get max_bufsize from the info object. 
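+     * The hint value is an ASCII string under the "ind_rd_buffer_size"
+     * key, e.g. "4194304" (a hypothetical 4 MiB setting); atoi()
+     * converts it below.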
*/ + value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + max_bufsize = atoi(value); + NCI_Free(value); + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. */ + + off = fd->disp + offset; + + start_off = off; + end_offset = off + bufsize - 1; + readbuf_off = off; + readbuf = (char *) NCI_Malloc(max_bufsize); + readbuf_len = MIN(max_bufsize, end_offset - readbuf_off + 1); + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off); + if (r_len < 0) return r_len; + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + BUFFERED_READ + off += buf_view.len[i]; + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(readbuf); + } + + else { /* noncontiguous in file */ + MPI_Offset size_in_filetype = offset; + + disp = fd->disp; + + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > size_in_filetype) { + st_index = i; + frd_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[i] + + size_in_filetype - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; + + /* Wei-keng Liao: read request is within a single flat_file contig + * block e.g. with subarray types that actually describe the whole + * array */ + if (buf_view.is_contig && bufsize <= frd_size) { + /* a count of bytes can overflow. operate on original type instead */ + r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset); + +assert(buf_view.size == r_len); + return r_len; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ + + st_frd_size = frd_size; + i_offset = 0; + j = st_index; + off = offset; + frd_size = MIN(st_frd_size, bufsize); + while (i_offset < bufsize) { + i_offset += frd_size; + end_offset = off + frd_size - 1; + +if (i_offset >= bufsize) break; + j++; + off = disp + fd->flat_file.off[j]; + frd_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + } + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + readbuf_off = 0; + readbuf_len = 0; + readbuf = (char *) NCI_Malloc(max_bufsize); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. 
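+             * Data sieving, illustrated with made-up numbers: to read the
+             * file pieces {off=0, len=4} and {off=8, len=4}, one 12-byte
+             * contiguous read at offset 0 fills readbuf, and the two
+             * pieces are memcpy'd out of it into the user buffer.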
+ */ + i_offset = 0; + j = st_index; + off = offset; + frd_size = MIN(st_frd_size, bufsize); + while (i_offset < bufsize) { + if (frd_size) { + req_off = off; + req_len = frd_size; + userbuf_off = i_offset; + BUFFERED_READ + } + + i_offset += frd_size; + if (i_offset >= bufsize) break; + + if (off + frd_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += frd_size; /* off is incremented by frd_size */ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + frd_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + frd_size = st_frd_size; + brd_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(frd_size, brd_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_READ + } + + num += size; + if (num >= bufsize) break; + + new_frd_size = frd_size; + new_brd_size = brd_size; + + if (size == frd_size) { + /* reached end of contiguous block in file */ + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + new_frd_size = fd->flat_file.len[j]; + if (size != brd_size) { + i_offset += size; + new_brd_size -= size; + } + } + + if (size == brd_size) { + /* reached end of contiguous block in memory */ + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_brd_size = buf_view.len[k]; + if (size != frd_size) { + off += size; + new_frd_size -= size; + } + } + frd_size = new_frd_size; + brd_size = new_brd_size; + } + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(readbuf); /* malloced in the buffered_read macro */ + } + + assert(total_r_len >= buf_view.size); + + return buf_view.size; +} diff --git a/src/drivers/pncio/pncio_read_str_naive.c b/src/drivers/pncio/pncio_read_str_naive.c new file mode 100644 index 000000000..fa003f403 --- /dev/null +++ b/src/drivers/pncio/pncio_read_str_naive.c @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd, + void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + int b_index; + MPI_Offset size, brd_size, frd_size=0, req_len, sum, off, req_off, disp; + MPI_Offset end_offset=0, start_off, abs_off_in_filetype=0, userbuf_off; + MPI_Offset r_len, total_r_len=0; + MPI_Count bufsize; + +// printf("%s at %d:\n",__func__,__LINE__); + + if (fd->flat_file.size == 0) + return 0; + + bufsize = buf_view.size; + + /* contiguous in buftype and filetype is handled elsewhere */ + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. 
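+         * In this naive scheme every contiguous piece is serviced by its
+         * own PNCIO_ReadContig() call, e.g. (hypothetically) a buffer view
+         * with buf_view.count = 1000 pieces issues 1000 separate reads;
+         * no sieving buffer is used here.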
*/ + + off = fd->disp + offset; + + start_off = off; + end_offset = off + bufsize - 1; + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + /* for each region in the buffer, grab the data and put it in place */ + for (b_index = 0; b_index < buf_view.count; b_index++) { + userbuf_off = buf_view.off[b_index]; + req_off = off; + req_len = buf_view.len[b_index]; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + + /* off is (potentially) used to save the final offset later */ + off += buf_view.len[b_index]; + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + } + else { /* noncontiguous in file */ + MPI_Offset size_in_filetype = offset; + + int f_index, st_index = 0; + MPI_Offset st_frd_size; + + /* First we're going to calculate a set of values for use in all + * the noncontiguous in file cases: + * start_off - starting byte position of data in file + * end_offset - last byte offset to be accessed in the file + * st_index - index of block in first filetype that we will be + * starting in (?) + * st_frd_size - size of the data in the first filetype block + * that we will read (accounts for being part-way + * into reading this block of the filetype + * + */ + + disp = fd->disp; + + sum = 0; + for (f_index = 0; f_index < fd->flat_file.count; f_index++) { + sum += fd->flat_file.len[f_index]; + if (sum > size_in_filetype) { + st_index = f_index; + frd_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[f_index] + + size_in_filetype - (sum - fd->flat_file.len[f_index]); + break; + } + } + + /* abs. offset in bytes in the file */ + start_off = disp + abs_off_in_filetype; + + st_frd_size = frd_size; + + /* start_off, st_index, and st_frd_size are + * all calculated at this point + */ + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_off=0 and 100 bytes to be read, end_offset=99 + */ + f_index = st_index; + userbuf_off = frd_size = MIN(st_frd_size, bufsize); + end_offset = start_off + frd_size - 1; + while (userbuf_off < bufsize) { + f_index++; +assert(f_index < fd->flat_file.count); + + off = disp + fd->flat_file.off[f_index]; + frd_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + userbuf_off += frd_size; + end_offset = off + frd_size - 1; + } + + /* End of calculations. At this point the following values have + * been calculated and are ready for use: + * - start_off + * - end_offset + * - st_index + * - st_frd_size + */ + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file. should be the + * most common case. + */ + + userbuf_off = 0; + f_index = st_index; + off = start_off; + frd_size = MIN(st_frd_size, bufsize); + + /* while there is still space in the buffer, read more data */ + while (userbuf_off < bufsize) { + if (frd_size) { + /* TYPE_UB and TYPE_LB can result in + * frd_size = 0. 
save system call in such cases */ + req_off = off; + req_len = frd_size; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + } + userbuf_off += frd_size; + if (userbuf_off >= bufsize) break; + + if (off + frd_size < disp + fd->flat_file.off[f_index] + + fd->flat_file.len[f_index]) { + /* important that this value be correct, as it is + * used to set the offset in the fd near the end of + * this function. + */ + off += frd_size; + } + /* did not reach end of contiguous block in filetype. + * no more I/O needed. off is incremented by frd_size. + */ + else { + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + frd_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + } + } + } else { + MPI_Offset i_offset, tmp_bufsize = 0; + /* noncontiguous in memory as well as in file */ + + b_index = 0; + i_offset = buf_view.off[0]; + f_index = st_index; + off = start_off; + frd_size = st_frd_size; + brd_size = buf_view.len[0]; + + /* while we haven't read size * count bytes, keep going */ + while (tmp_bufsize < bufsize) { + MPI_Offset new_brd_size = brd_size, new_frd_size = frd_size; + + size = MIN(frd_size, brd_size); + /* keep max of a single read amount <= INT_MAX */ + size = MIN(size, INT_MAX); + + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + } + + tmp_bufsize += size; + if (tmp_bufsize >= bufsize) break; + + if (size == frd_size) { + /* reached end of contiguous block in file */ + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + + new_frd_size = fd->flat_file.len[f_index]; + if (size != brd_size) { + i_offset += size; + new_brd_size -= size; + } + } + + if (size == brd_size) { + /* reached end of contiguous block in memory */ + b_index++; +assert(b_index < buf_view.count); + i_offset = buf_view.off[b_index]; + new_brd_size = buf_view.len[b_index]; + if (size != frd_size) { + off += size; + new_frd_size -= size; + } + } + frd_size = new_frd_size; + brd_size = new_brd_size; + } + } + + /* unlock the file region if we locked it */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + } /* end of (else noncontiguous in file) */ + + return total_r_len; +} diff --git a/src/drivers/pncio/pncio_set_size.c b/src/drivers/pncio/pncio_set_size.c new file mode 100644 index 000000000..77490f481 --- /dev/null +++ b/src/drivers/pncio/pncio_set_size.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h> /* ftruncate(), lseek() */
+#endif
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_set_size() >-----------------------------------------------*/
+int PNCIO_File_set_size(PNCIO_File *fd,
+                        MPI_Offset  size)
+{
+    int err = NC_NOERR, rank;
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        err = ftruncate(fd->fd_sys, (off_t) size);
+        if (err != 0)
+            err = ncmpii_error_posix2nc("ftruncate");
+    }
+
+    MPI_Bcast(&err, 1, MPI_INT, 0, fd->comm);
+
+    return err;
+}
+
+/*----< PNCIO_File_get_size() >-----------------------------------------------*/
+int PNCIO_File_get_size(PNCIO_File *fd,
+                        MPI_Offset *size)
+{
+    int err = NC_NOERR, rank;
+    MPI_Offset msg[2];
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        *size = lseek(fd->fd_sys, 0, SEEK_END);
+        if (*size == -1)
+            err = ncmpii_error_posix2nc("lseek");
+        msg[0] = err;
+        msg[1] = *size;
+    }
+
+    MPI_Bcast(msg, 2, MPI_OFFSET, 0, fd->comm);
+    err = (int)msg[0];
+    *size = msg[1];
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_set_view.c b/src/drivers/pncio/pncio_set_view.c
new file mode 100644
index 000000000..ddf41e968
--- /dev/null
+++ b/src/drivers/pncio/pncio_set_view.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_set_view() >-----------------------------------------------*/
+/* For PnetCDF, this subroutine is an independent call, because PnetCDF uses
+ * only the following:
+ * Argument etype is always MPI_BYTE.
+ * Argument datarep is always "native".
+ * Argument info is always MPI_INFO_NULL.
+ */
+int PNCIO_File_set_view(PNCIO_File   *fd,
+                        MPI_Offset    disp,
+                        MPI_Datatype  filetype,
+                        MPI_Aint      npairs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                        MPI_Count    *offsets,
+                        MPI_Count    *lengths
+#else
+                        MPI_Offset   *offsets,
+                        int          *lengths
+#endif
+)
+{
+    MPI_Aint i;
+
+assert(filetype == MPI_BYTE);
+assert(disp == 0);
+fd->filetype = filetype;
+fd->disp = 0;
+
+    fd->flat_file.count = npairs;
+    fd->flat_file.off   = offsets;
+    fd->flat_file.len   = lengths;
+    fd->flat_file.idx   = 0;
+    fd->flat_file.rem   = (npairs > 0) ? lengths[0] : 0;
+
+    /* Size of fileview must be calculated here, as PnetCDF may coalesce the
+     * offset-length pairs in order to make offsets sorted in a monotonically
+     * non-decreasing order.
+     */
+    fd->flat_file.size = 0;
+    for (i=0; i<npairs; i++) fd->flat_file.size += lengths[i];
+
+    /* is_contig is redundant to (count <= 1), but convenient */
+    fd->flat_file.is_contig = (npairs <= 1);
+
+    return NC_NOERR;
+}
+
diff --git a/src/drivers/pncio/pncio_sync.c b/src/drivers/pncio/pncio_sync.c
new file mode 100644
index 000000000..49dc31bff
--- /dev/null
+++ b/src/drivers/pncio/pncio_sync.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h> /* fsync(), unlink(), ftruncate(), lseek() */
+#endif
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_sync() >---------------------------------------------------*/
+int PNCIO_File_sync(PNCIO_File *fd)
+{
+    int err = NC_NOERR;
+
+    if (fd->is_open > 0) {
+        err = fsync(fd->fd_sys);
+        if (err != 0)
+            err = ncmpii_error_posix2nc("fsync");
+    }
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_utils.c b/src/drivers/pncio/pncio_utils.c
new file mode 100644
index 000000000..c4c1629e1
--- /dev/null
+++ b/src/drivers/pncio/pncio_utils.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdarg.h> /* va_start(), va_end() */
+
+#include <pncio.h>
+
+/* some systems do not have pread/pwrite, or require XOPEN_SOURCE set higher
+ * than we would like. see #1973 */
+#if (HAVE_DECL_PWRITE == 0)
+
+#include <sys/types.h>
+#include <unistd.h>
+
+ssize_t pread(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread(int fd, void *buf, size_t count, off_t offset)
+{
+    off_t lseek_ret;
+    off_t old_offset;
+    ssize_t read_ret;
+
+    old_offset = lseek(fd, 0, SEEK_CUR);
+    lseek_ret = lseek(fd, offset, SEEK_SET);
+    if (lseek_ret == -1)
+        return lseek_ret;
+    read_ret = read(fd, buf, count);
+    if (read_ret < 0)
+        return read_ret;
+    /* man page says "file offset is not changed" */
+    if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0)
+        return lseek_ret;
+
+    return read_ret;
+}
+
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    off_t lseek_ret;
+    off_t old_offset;
+    ssize_t write_ret;
+
+    old_offset = lseek(fd, 0, SEEK_CUR);
+    lseek_ret = lseek(fd, offset, SEEK_SET);
+    if (lseek_ret == -1)
+        return lseek_ret;
+    write_ret = write(fd, buf, count);
+    if (write_ret < 0)
+        return write_ret;
+    /* man page says "file offset is not changed" */
+    if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0)
+        return lseek_ret;
+
+    return write_ret;
+}
+#endif
+
+void PNCIO_Heap_merge(PNCIO_Access * others_req, MPI_Count * count,
+                      MPI_Offset * srt_off, MPI_Count * srt_len,
+                      MPI_Count * start_pos, int nprocs, int nprocs_recv,
+                      MPI_Count total_elements)
+{
+    typedef struct {
+        MPI_Offset *off_list;
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Offset *len_list;
+#else
+        int *len_list;
+#endif
+        MPI_Count nelem;
+    } heap_struct;
+
+    heap_struct *a, tmp;
+    int i, j, heapsize, l, r, k, smallest;
+
+    a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct));
+
+    j = 0;
+    for (i = 0; i < nprocs; i++)
+        if (count[i]) {
+            a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
+            a[j].len_list = &(others_req[i].lens[start_pos[i]]);
+            a[j].nelem = count[i];
+            j++;
+        }
+
+    /* build a heap out of the first element from each list, with
+     * the smallest element of the heap at the root */
+
+    heapsize = nprocs_recv;
+    for (i = heapsize / 2 - 1; i >= 0; i--) {
+        /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
+         * modified for a heap with smallest element at root. I have
+         * removed the recursion so that there are no function calls.
+         * Function calls are too expensive.
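+         * An illustration with hypothetical lists: merging the sorted
+         * per-process offset lists {0,5}, {2,3} and {7} pops the smallest
+         * heap root each time, yielding srt_off = {0,2,3,5,7}; the popped
+         * list's head pointer advances, and heapsize shrinks once a list
+         * is exhausted.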
*/ + k = i; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + + for (i = 0; i < total_elements; i++) { + /* extract smallest element from heap, i.e. the root */ + srt_off[i] = *(a[0].off_list); + srt_len[i] = *(a[0].len_list); + (a[0].nelem)--; + + if (!a[0].nelem) { + a[0].off_list = a[heapsize - 1].off_list; + a[0].len_list = a[heapsize - 1].len_list; + a[0].nelem = a[heapsize - 1].nelem; + heapsize--; + } else { + (a[0].off_list)++; + (a[0].len_list)++; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + NCI_Free(a); +} + diff --git a/src/drivers/pncio/pncio_write.c b/src/drivers/pncio/pncio_write.c new file mode 100644 index 000000000..debd07e1d --- /dev/null +++ b/src/drivers/pncio/pncio_write.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include /* pwrite() */ + +#include + +#include "pncio.h" + +#ifdef WKL_DEBUG +int first_ost_id; +#endif + +/*----< PNCIO_WriteContig() >-------------------------------------------------*/ +MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, + const void *buf, + MPI_Offset w_size, + MPI_Offset offset) +{ + ssize_t err = 0; + size_t w_count; + MPI_Offset bytes_xfered = 0; + char *p; + + if (w_size == 0) return NC_NOERR; + +// printf("%s at %d: pwrite offset=%lld w_size=%lld\n",__func__,__LINE__,offset,w_size); +#ifdef WKL_DEBUG +int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); + +MPI_Offset ost_id = (offset / fd->hints->striping_unit) % fd->hints->striping_factor; + if (first_ost_id == -1) { + first_ost_id = ost_id; + // printf("%2d %s file %s First pwrite offset=%lld OST %d\n",rank,__func__,fd->filename,offset,first_ost_id); + } + else if (ost_id != first_ost_id) + printf("%2d Error: %s pwrite offset=%lld w_size=%lld ost_id=%lld not same 1st ost %d\n",rank,__func__,offset,w_size,ost_id,first_ost_id); + +printf("%s line %d: disp=%lld offset=%lld count=%ld bufType_size=%d w_size=%lld\n",__func__,__LINE__,fd->disp,offset,count,bufType_size,w_size); + + printf("%2d %s line %d pwrite offset=%lld w_size=%lld\n",rank,__func__,__LINE__,offset,w_size); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double timing = MPI_Wtime(); +#endif + p = (char *) buf; + while (bytes_xfered < w_size) { + w_count = w_size - bytes_xfered; + err = pwrite(fd->fd_sys, p, w_count, offset + bytes_xfered); + if (err == -1) + goto ioerr; + if (err == 0) + break; + bytes_xfered += err; + p += err; + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[2] += MPI_Wtime() - timing; +#endif + +ioerr: + if (err == -1) + bytes_xfered = ncmpii_error_posix2nc("pwrite"); + + return bytes_xfered; +} + +/*----< file_write() >-------------------------------------------------------*/ +/* This is an independent call. */ +static +MPI_Offset file_write(PNCIO_File *fd, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + MPI_Offset w_len; + + if (buf_view.size == 0) /* zero-sized request */ + return NC_NOERR; + +assert(fd->filetype == MPI_BYTE); + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + else if (fd->file_system == PNCIO_LUSTRE) + w_len = PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset); + else if (fd->file_system == PNCIO_UFS) + w_len = PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset); + else + return NC_EFSTYPE; + + return w_len; /* when w_len < 0, it is an NetCDF error code */ +} + +/*----< PNCIO_File_write_at() >-----------------------------------------------*/ +/* This is an independent call. + * offset is a position in the file relative to the current view, expressed as + * a count of etypes. + */ +MPI_Offset PNCIO_File_write_at(PNCIO_File *fh, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + assert(fh != NULL); + + if (buf_view.size == 0) /* zero-sized request */ + return NC_NOERR; + + if (buf_view.size < 0) return NC_ENEGATIVECNT; + + if (fh->access_mode & MPI_MODE_RDONLY) + return NC_EPERM; + + return file_write(fh, offset, buf, buf_view); +} + +/*----< PNCIO_File_write_at_all() >-------------------------------------------*/ +/* This is a collective call. 
+ * offset is a position in the file relative to the current view, expressed as + * a count of etypes. + */ +MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR; + MPI_Offset w_len; + + assert(fh != NULL); + + if (buf_view.size < 0) err = NC_ENEGATIVECNT; + + if (fh->access_mode & MPI_MODE_RDONLY && err == NC_NOERR) + err = NC_EPERM; + + if (fh->file_system == PNCIO_LUSTRE) + w_len = PNCIO_LUSTRE_WriteStridedColl(fh, buf, buf_view, offset); + else if (fh->file_system == PNCIO_UFS) + w_len = PNCIO_GEN_WriteStridedColl(fh, buf, buf_view, offset); + else + return NC_EFSTYPE; + + return (err == NC_NOERR) ? w_len : err; +} + + diff --git a/src/drivers/pncio/pncio_write_coll.c b/src/drivers/pncio/pncio_write_coll.c new file mode 100644 index 000000000..d6126c5ce --- /dev/null +++ b/src/drivers/pncio/pncio_write_coll.c @@ -0,0 +1,898 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include "pncio.h" + +/* prototypes of functions used for collective writes only. */ +static MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, int nprocs, int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx); + +static MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Offset off, MPI_Count size, /* 10 */ + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_recv, MPI_Count * + sent_to_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, + MPI_Count *send_buf_idx, MPI_Count *curr_to_proc, + MPI_Count *done_to_proc, int *hole, int iter, + MPI_Aint * buf_idx); + +static void Fill_send_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, char **send_buf, + MPI_Count *send_size, MPI_Request *requests, + MPI_Count *sent_to_proc, int nprocs, int myrank, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset *fd_start, + MPI_Offset *fd_end, MPI_Count *send_buf_idx, + MPI_Count *curr_to_proc, MPI_Count *done_to_proc, int iter); + +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) /* relative to fileview */ +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. + * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + PNCIO_Access *my_req; + /* array of nprocs access structures, one for each other process in + * whose file domain this process's request lies */ + + PNCIO_Access *others_req; + /* array of nprocs access structures, one for each other process + * whose request lies in this process's file domain. 
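+     * For example (hypothetical ranks): if part of rank 0's request falls
+     * inside rank 1's file domain, then after the Calc routines below,
+     * rank 0 has my_req[1].count > 0 and rank 1 has others_req[0].count > 0.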
*/ + + int i, nprocs, nprocs_for_coll, myrank, interleave_count=0; + MPI_Aint *buf_idx = NULL; + MPI_Count *count_my_req_per_proc, count_my_req_procs; + MPI_Count *count_others_req_per_proc, count_others_req_procs; + MPI_Offset start_offset, end_offset, fd_size, min_st_offset; + MPI_Offset *st_offsets=NULL, *fd_start=NULL; + MPI_Offset *fd_end=NULL, *end_offsets=NULL, w_len=0; + +// printf("%s at %d: offset=%lld buf_view.size=%lld\n",__func__,__LINE__, offset,buf_view.size); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +double curT = MPI_Wtime(); +#endif + + /* the number of processes that actually perform I/O, nprocs_for_coll, is + * stored in the hints off the PNCIO_File structure + */ + nprocs_for_coll = fd->hints->cb_nodes; + + /* only check for interleaving if cb_write isn't disabled */ + if (fd->hints->cb_write != PNCIO_HINT_DISABLE) { + /* For this process's request, calculate the file start and end + * offsets. Note: end_offset points to the last byte-offset that will + * be accessed, e.g., if start_offset=0 and 100 bytes to be read, + * end_offset=99. + */ + if (fd->flat_file.size == 0) { + start_offset = 0; + end_offset = -1; + } + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } + + /* Each process communicates its start and end offsets to other + * processes. The result is an array each of start and end offsets + * stored in order of process rank. + */ + + st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + end_offsets = st_offsets + nprocs; + + MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET, + fd->comm); + MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET, + fd->comm); + + /* Are the accesses of different processes interleaved? Below is a + * rudimentary check for interleaving, but should suffice for the + * moment. + */ + for (i = 1; i < nprocs; i++) + if (st_offsets[i] < end_offsets[i - 1] && + st_offsets[i] <= end_offsets[i]) + interleave_count++; + } + + if (fd->hints->cb_write == PNCIO_HINT_DISABLE || + (!interleave_count && (fd->hints->cb_write == PNCIO_HINT_AUTO))) { + + /* use independent accesses */ + if (fd->hints->cb_write != PNCIO_HINT_DISABLE) + NCI_Free(st_offsets); + if (buf_view.size == 0) return 0; + + /* offset is relative to fileview */ +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + else + w_len = PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset); + + return w_len; + } + +// printf("%s at %d:\n",__func__,__LINE__); +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. 
*/ + + PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, + fd->hints->striping_unit); + +/* calculate what portions of the access requests of this process are + located in what file domains */ + + PNCIO_Calc_my_req(fd, min_st_offset, fd_start, fd_end, fd_size, nprocs, + &count_my_req_procs, &count_my_req_per_proc, &my_req, + &buf_idx); + +/* based on everyone's my_req, calculate what requests of other + processes lie in this process's file domain. + count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + + PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, + my_req, nprocs, myrank, &count_others_req_procs, + &count_others_req_per_proc, &others_req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + +/* exchange data and write in sizes of no more than coll_bufsize. */ + /* Cast away const'ness for the below function */ + w_len = Exch_and_write(fd, (char *) buf, buf_view, nprocs, myrank, + others_req, min_st_offset, fd_size, fd_start, + fd_end, buf_idx); + + /* If this collective write is followed by an independent write, + * it's possible to have those subsequent writes on other processes + * race ahead and sneak in before the read-modify-write completes. + * We carry out a collective communication at the end here so no one + * can start independent i/o before collective I/O completes. + * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information */ + + /* optimization: if only one process performing i/o, we can perform + * a less-expensive Bcast + */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + + /* free all memory allocated for collective I/O */ + PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx); + PNCIO_Free_others_req(count_others_req_per_proc, others_req); + + NCI_Free(st_offsets); + NCI_Free(fd_start); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +/* If successful, it returns the amount written. Otherwise a NetCDF error code + * (negative value) is returned. + */ +static +MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, PNCIO_View buf_view, + int nprocs, + int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx) +{ +/* Send data to appropriate processes and write in sizes of no more + than coll_bufsize. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were written all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. 
For example, to write a distributed + array to a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset size=0, w_len, total_w_len=0; + int hole, i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, req_off; + char *write_buf = NULL; + MPI_Count *curr_offlen_ptr, *send_size, *count, req_len, *recv_size; + MPI_Count *partial_recv, *sent_to_proc, *start_pos; + int flag; + MPI_Count *send_buf_idx, *curr_to_proc, *done_to_proc; + int info_flag; + MPI_Aint coll_bufsize; + char *value; + + /* only I/O errors are currently reported */ + +/* calculate the number of writes of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. */ + + value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + coll_bufsize = atoi(value); + NCI_Free(value); + + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + +/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/ + + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + + if ((st_loc == -1) && (end_loc == -1)) { + ntimes = 0; /* this process does no writing. */ + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], max_ntimes); +#endif + + write_buf = fd->io_buf; + + curr_offlen_ptr = NCI_Calloc(nprocs * 10, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. */ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_recv = count + nprocs; + /* if only a portion of the last off-len pair is recd. from a process + * in a particular iteration, the length recd. is stored here. + * calloc initializes to 0. */ + + send_size = partial_recv + nprocs; + /* total size of data to be sent to each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. */ + + sent_to_proc = recv_size + nprocs; + /* amount of data sent to each proc so far. Used in + * Fill_send_buffer. initialized to 0 here. 
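+     * (All ten per-process arrays above and below share the single
+     * calloc'd block of 10 * nprocs counters, carved up by pointer
+     * arithmetic, so the one NCI_Free(curr_offlen_ptr) at the end
+     * releases them all.)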
*/ + + send_buf_idx = sent_to_proc + nprocs; + curr_to_proc = send_buf_idx + nprocs; + done_to_proc = curr_to_proc + nprocs; + /* Above three are used in Fill_send_buffer */ + + start_pos = done_to_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; +// printf("%s at %d: off=%lld buf_view.size=%lld ntimes=%d\n",__func__,__LINE__, off,buf_view.size,ntimes); + + for (m = 0; m < ntimes; m++) { + /* go through all others_req and check which will be satisfied + * by the current write */ + + /* Note that MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order and that, for writes, the + * filetypes cannot specify overlapping regions in the file. This + * simplifies implementation a bit compared to reads. */ + + /* off = start offset in the file for the data to be written in + * this iteration + * size = size of data written (bytes) corresponding to off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + /* first calculate what should be communicated */ + + for (i = 0; i < nprocs; i++) + count[i] = recv_size[i] = 0; + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + if (partial_recv[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_recv[i]; + req_len = others_req[i].lens[j] - partial_recv[i]; + partial_recv[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < off + size) { + count[i]++; + if (myrank != i) { + MPI_Aint addr; + MPI_Get_address(write_buf + req_off - off, &addr); + others_req[i].mem_ptrs[j] = addr; + } + else + others_req[i].mem_ptrs[j] = req_off - off; + recv_size[i] += MIN(off + size - req_off, req_len); + + if (off + size - req_off < req_len) { + partial_recv[i] = (off + size - req_off); + + /* --BEGIN ERROR HANDLING-- */ + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < off + size)) { + /* This error should not happen to PnetCDF, as + * fileview is checked before entering this + * subroutine. 
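+ * Overlapping regions would invalidate the hole detection and the
+ * read-modify-write carried out below.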
+ */
+ fprintf(stderr, "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)\n");
+ /* ROMIO allows this case to continue since additional
+ * communication might have to occur, but PnetCDF returns
+ * an error instead
+ */
+ return NC_EFILE;
+ /* --END ERROR HANDLING-- */
+ break;
+ }
+ } else
+ break;
+ }
+ curr_offlen_ptr[i] = j;
+ }
+ }
+
+ w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size,
+ recv_size, off, size, count, start_pos,
+ partial_recv, sent_to_proc, nprocs, myrank,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buf_idx);
+
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ if (count[i])
+ flag = 1;
+
+ if (flag) {
+ w_len = PNCIO_WriteContig(fd, write_buf, size, off);
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+ }
+
+ off += size;
+ done += size;
+ }
+
+ for (i = 0; i < nprocs; i++)
+ count[i] = recv_size[i] = 0;
+ for (m = ntimes; m < max_ntimes; m++) {
+ /* nothing to recv, but check for send. */
+ w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size,
+ recv_size, off, size, count, start_pos,
+ partial_recv, sent_to_proc, nprocs, myrank,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buf_idx);
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+ }
+
+ NCI_Free(curr_offlen_ptr);
+
+ return total_w_len;
+}
+
+
+/* Returns NC_NOERR if successful, or a NetCDF error code (negative value)
+ * in the case of error.
+ */
+static
+MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf,
+ PNCIO_View buf_view,
+ MPI_Count *send_size, MPI_Count *recv_size,
+ MPI_Offset off, MPI_Count size,
+ MPI_Count *count, MPI_Count * start_pos,
+ MPI_Count *partial_recv,
+ MPI_Count *sent_to_proc, int nprocs,
+ int myrank,
+ MPI_Offset min_st_offset,
+ MPI_Offset fd_size,
+ MPI_Offset * fd_start, MPI_Offset * fd_end,
+ PNCIO_Access * others_req,
+ MPI_Count * send_buf_idx, MPI_Count * curr_to_proc,
+ MPI_Count * done_to_proc, int *hole, int iter,
+ MPI_Aint *buf_idx)
+{
+ int i, j, nprocs_recv, nprocs_send, err=NC_NOERR;
+ MPI_Count *tmp_len;
+ char **send_buf = NULL;
+ MPI_Request *requests, *send_req;
+ MPI_Datatype *recv_types, self_recv_type = MPI_DATATYPE_NULL;
+ MPI_Status *statuses, status;
+ MPI_Count sum, *srt_len = NULL;
+ int num_rtypes, nreqs;
+ MPI_Offset *srt_off = NULL;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+/* exchange recv_size info so that each process knows how much to
+ send to whom. */
+
+ MPI_Alltoall(recv_size, 1, MPI_COUNT, send_size, 1, MPI_COUNT, fd->comm);
+
+ /* create derived datatypes for recv */
+
+ nprocs_send = 0;
+ nprocs_recv = 0;
+ sum = 0;
+ for (i = 0; i < nprocs; i++) {
+ sum += count[i];
+ if (recv_size[i])
+ nprocs_recv++;
+ if (send_size[i])
+ nprocs_send++;
+ }
+
+ recv_types = (MPI_Datatype *) NCI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
+ /* +1 to avoid a 0-size malloc */
+
+ tmp_len = NCI_Malloc(nprocs * sizeof(*tmp_len));
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ MPI_Datatype *dtype;
+ dtype = (i != myrank) ?
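/* non-self receives take an entry in recv_types[]; data from self is described by self_recv_type and unpacked locally */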
(recv_types + j) : (&self_recv_type);
+
+ if (partial_recv[i]) {
+ /* take care if the last off-len pair is a partial recv */
+ MPI_Count k = start_pos[i] + count[i] - 1;
+ tmp_len[i] = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_recv[i];
+ }
+#ifdef HAVE_MPI_LARGE_COUNT
+ MPI_Type_create_hindexed_c(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, dtype);
+#else
+ MPI_Type_create_hindexed(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, dtype);
+#endif
+ /* absolute displacements; use MPI_BOTTOM in recv */
+ MPI_Type_commit(dtype);
+ if (i != myrank)
+ j++;
+ }
+ }
+ num_rtypes = j; /* number of non-self receive datatypes created */
+
+ /* To avoid a read-modify-write, check if there are holes in the
+ * data to be written. For this, merge the (sorted) offset lists
+ * others_req using a heap-merge. */
+
+/* TODO: PNCIO_Heap_merge is expensive, borrow code from ad_lustre_wrcoll.c to skip it when possible */
+
+ /* valgrind-detected optimization: if there is no work on this process we do
+ * not need to search for holes */
+ if (sum) {
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+ double timing = MPI_Wtime();
+#endif
+ srt_off = (MPI_Offset *) NCI_Malloc(sum * sizeof(MPI_Offset));
+ srt_len = NCI_Malloc(sum * sizeof(*srt_len));
+
+ PNCIO_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
+ nprocs, nprocs_recv, sum);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+ if (fd->is_agg) fd->write_timing[5] += MPI_Wtime() - timing;
+#endif
+ }
+
+ /* for partial recvs, restore original lengths */
+ for (i = 0; i < nprocs; i++)
+ if (partial_recv[i]) {
+ MPI_Count k = start_pos[i] + count[i] - 1;
+ others_req[i].lens[k] = tmp_len[i];
+ }
+ NCI_Free(tmp_len);
+
+ /* check if there are any holes. If yes, must do read-modify-write.
+ * holes can be in three places. 'middle' is what you'd expect: the
+ * processes are operating on noncontiguous data. But holes can also show
+ * up at the beginning or end of the file domain (see John Bent ROMIO REQ
+ * #835). Missing these holes would result in us writing more data than
+ * received by everyone else. */
+
+ *hole = 0;
+ if (sum) {
+ if (off != srt_off[0]) /* hole at the front */
+ *hole = 1;
+ else { /* coalesce the sorted offset-length pairs */
+ for (i = 1; i < sum; i++) {
+ if (srt_off[i] <= srt_off[0] + srt_len[0]) {
+ MPI_Count new_len = srt_off[i] + srt_len[i] - srt_off[0];
+ if (new_len > srt_len[0])
+ srt_len[0] = new_len;
+ } else
+ break;
+ }
+ if (i < sum || size != srt_len[0]) /* hole in middle or end */
+ *hole = 1;
+ }
+
+ NCI_Free(srt_off);
+ NCI_Free(srt_len);
+ }
+
+ if (nprocs_recv) {
+ if (*hole) {
+ MPI_Offset r_len;
+ r_len = PNCIO_ReadContig(fd, write_buf, size, off);
+ if (r_len < 0) return r_len;
+ }
+ }
+
+ if (fd->atomicity) {
+ /* nreqs counts only the Isends to be posted; in atomic mode the
+ * receives below are blocking MPI_Recv calls */
+ nreqs = (send_size[myrank]) ?
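/* exclude the self-send: data to self is unpacked locally, not sent */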
(nprocs_send - 1) : nprocs_send;
+ requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+ send_req = requests;
+ } else {
+ nreqs = nprocs_send + nprocs_recv;
+ if (send_size[myrank]) /* NO send to and recv from self */
+ nreqs -= 2;
+ requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+ /* +1 to avoid a 0-size malloc */
+
+ /* post receives */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i] == 0)
+ continue;
+ if (i != myrank) {
+ MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, 0,
+ fd->comm, requests + j);
+ j++;
+ } else if (buf_view.is_contig) {
+ /* send/recv to/from self uses MPI_Unpack() */
+#ifdef HAVE_MPI_LARGE_COUNT
+ MPI_Count position=0;
+ MPI_Unpack_c((char *) buf + buf_idx[i], recv_size[i], &position,
+ write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#else
+ int position = 0;
+ assert(recv_size[i] < INT_MAX);
+ MPI_Unpack((char *) buf + buf_idx[i], (int)recv_size[i], &position,
+ write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#endif
+ buf_idx[i] += recv_size[i];
+ }
+ }
+ send_req = requests + j;
+ }
+
+/* post sends. if buf_view.is_contig, data can be directly sent from
+ user buf at location given by buf_idx. else use send_buf. */
+
+ if (buf_view.is_contig) {
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i] && i != myrank) {
+ assert(buf_idx[i] != -1);
+#if MPI_VERSION >= 4
+ MPI_Isend_c((char *) buf + buf_idx[i], send_size[i],
+ MPI_BYTE, i, 0, fd->comm, send_req + j);
+#else
+ MPI_Isend((char *) buf + buf_idx[i], send_size[i],
+ MPI_BYTE, i, 0, fd->comm, send_req + j);
+#endif
+ j++;
+ buf_idx[i] += send_size[i];
+ }
+ } else if (nprocs_send) {
+ /* buftype is not contig */
+ size_t msgLen = 0;
+ for (i = 0; i < nprocs; i++)
+ msgLen += send_size[i];
+ send_buf = (char **) NCI_Malloc(nprocs * sizeof(char *));
+ send_buf[0] = (char *) NCI_Malloc(msgLen * sizeof(char));
+ for (i = 1; i < nprocs; i++)
+ send_buf[i] = send_buf[i - 1] + send_size[i - 1];
+
+ Fill_send_buffer(fd, buf, buf_view, send_buf, send_size, send_req,
+ sent_to_proc, nprocs, myrank, min_st_offset, fd_size,
+ fd_start, fd_end, send_buf_idx, curr_to_proc,
+ done_to_proc, iter);
+
+ /* the send is done in Fill_send_buffer */
+ }
+
+ if (fd->atomicity) {
+ /* In atomic mode, we must use blocking receives to receive data in the
+ * same increasing order of MPI process rank IDs.
+ */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i] == 0)
+ continue;
+ if (i != myrank) {
+ MPI_Recv(MPI_BOTTOM, 1, recv_types[j++], i, 0,
+ fd->comm, &status);
+ } else {
+ /* send/recv to/from self uses MPI_Unpack() */
+ char *ptr = (buf_view.is_contig) ?
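/* self data comes straight from the user buffer when it is contiguous, otherwise from the packed send_buf */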
(char *) buf + buf_idx[i] : send_buf[i]; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count position=0; + MPI_Unpack_c(ptr, recv_size[i], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#else + int position = 0; + assert(recv_size[i] < INT_MAX); + MPI_Unpack(ptr, (int)recv_size[i], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#endif + buf_idx[i] += recv_size[i]; + } + } + } else if (!buf_view.is_contig && recv_size[myrank]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count position=0; + MPI_Unpack_c(send_buf[myrank], recv_size[myrank], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#else + int position = 0; + assert(recv_size[myrank] < INT_MAX); + MPI_Unpack(send_buf[myrank], (int)recv_size[myrank], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#endif + } + + for (i = 0; i < num_rtypes; i++) + MPI_Type_free(recv_types + i); + NCI_Free(recv_types); + + if (self_recv_type != MPI_DATATYPE_NULL) + MPI_Type_free(&self_recv_type); + +#ifdef HAVE_MPI_STATUSES_IGNORE + statuses = MPI_STATUSES_IGNORE; +#else + statuses = (MPI_Status *) NCI_Malloc(nreqs * sizeof(MPI_Status)); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[4] += MPI_Wtime() - curT; + curT = MPI_Wtime(); +#endif + MPI_Waitall(nreqs, requests, statuses); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[3] += MPI_Wtime() - curT; +#endif + +#ifndef HAVE_MPI_STATUSES_IGNORE + NCI_Free(statuses); +#endif + NCI_Free(requests); + if (!buf_view.is_contig && nprocs_send) { + NCI_Free(send_buf[0]); + NCI_Free(send_buf); + } + + return err; +} + +#define BUF_INCR \ +{ \ + while (buf_incr) { \ + size_in_buf = MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (buf_incr > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ +} + +#define BUF_COPY \ +{ \ + while (size) { \ + size_in_buf = MIN(size, flat_buf_sz); \ + memcpy(&(send_buf[p][send_buf_idx[p]]), \ + ((char *) buf) + user_buf_idx, size_in_buf); \ + send_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (size > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ + BUF_INCR \ +} + +static +void Fill_send_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, char **send_buf, + MPI_Count * send_size, + MPI_Request * requests, MPI_Count * sent_to_proc, + int nprocs, int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Count * send_buf_idx, MPI_Count * curr_to_proc, + MPI_Count * done_to_proc, int iter) +{ +/* this function is only called if buftype is not contig */ + + int p, jj; + MPI_Offset flat_buf_idx, flat_buf_sz, size_in_buf, buf_incr, size; + MPI_Offset off, len, rem_len, user_buf_idx; + +/* curr_to_proc[p] = amount of data sent to proc. p that has already + been accounted for so far + done_to_proc[p] = amount of data already sent to proc. p in + previous iterations + user_buf_idx = current location in user buffer + send_buf_idx[p] = current location in send_buf of proc. 
p */ + + for (MPI_Count i = 0; i < nprocs; i++) { + send_buf_idx[i] = curr_to_proc[i] = 0; + done_to_proc[i] = sent_to_proc[i]; + } + jj = 0; + + user_buf_idx = buf_view.off[0]; + flat_buf_idx = 0; + flat_buf_sz = buf_view.len[0]; + + /* flat_buf_idx = current index into flattened buftype + * flat_buf_sz = size of current contiguous component in + * flattened buf */ + + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + off = fd->flat_file.off[i]; + rem_len = fd->flat_file.len[i]; + + /*this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. + */ + p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end); + + if (send_buf_idx[p] < send_size[p]) { + if (curr_to_proc[p] + len > done_to_proc[p]) { + if (done_to_proc[p] > curr_to_proc[p]) { + size = MIN(curr_to_proc[p] + len - + done_to_proc[p], send_size[p] - send_buf_idx[p]); + buf_incr = done_to_proc[p] - curr_to_proc[p]; + BUF_INCR + buf_incr = curr_to_proc[p] + len - done_to_proc[p]; + /* ok to cast: bounded by cb buffer size */ + curr_to_proc[p] = done_to_proc[p] + size; + BUF_COPY + } else { + size = MIN(len, send_size[p] - send_buf_idx[p]); + buf_incr = len; + curr_to_proc[p] += size; + BUF_COPY + } + if (send_buf_idx[p] == send_size[p] && p != myrank) { +#if MPI_VERSION >= 4 + MPI_Isend_c(send_buf[p], send_size[p], MPI_BYTE, p, + 0, fd->comm, &requests[jj++]); +#else + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, + 0, fd->comm, &requests[jj++]); +#endif + } + } else { + curr_to_proc[p] += len; + buf_incr = len; + BUF_INCR + } + } else { + buf_incr = len; + BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (int i = 0; i < nprocs; i++) { + if (send_size[i]) { + sent_to_proc[i] = curr_to_proc[i]; + } + } +} diff --git a/src/drivers/pncio/pncio_write_str.c b/src/drivers/pncio/pncio_write_str.c new file mode 100644 index 000000000..cb4ac25e8 --- /dev/null +++ b/src/drivers/pncio/pncio_write_str.c @@ -0,0 +1,328 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
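+ *
+ * This file implements PNCIO_GEN_WriteStrided(), which carries out a
+ * noncontiguous write request by staging data through an intermediate
+ * buffer (data sieving), so that each file access becomes one large
+ * contiguous read or write.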
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include
+
+#define BUFFERED_WRITE { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ if (writebuf_len) { \
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, \
+ writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (w_len < 0) goto fn_exit; \
+ total_w_len += w_len; \
+ } \
+ writebuf_off = req_off; \
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (r_len < 0) goto fn_exit; \
+ } \
+ write_sz = (MPI_Aint)MIN(req_len, writebuf_off+writebuf_len-req_off); \
+ memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (w_len < 0) goto fn_exit; \
+ total_w_len += w_len; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (r_len < 0) goto fn_exit; \
+ write_sz = MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+}
+
+
+MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd,
+ const void *buf,
+ PNCIO_View buf_view,
+ MPI_Offset offset)
+{
+
+/* offset is in units of etype relative to the filetype. */
+
+ char *writebuf = NULL;
+ int i, j, k, st_index = 0;
+ MPI_Aint writebuf_len, max_bufsize, write_sz, bufsize;
+ MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0;
+ MPI_Offset userbuf_off, off, req_off, disp, end_offset=0;
+ MPI_Offset writebuf_off, start_off, new_bwr_size, new_fwr_size;
+ MPI_Offset st_fwr_size, fwr_size = 0, bwr_size, req_len;
+ MPI_Offset r_len, w_len, total_w_len=0;
+
+ /* Contiguous both in buftype and filetype should have been handled in a
+ * call to PNCIO_WriteContig() earlier.
+ */
+ assert(!(buf_view.is_contig && fd->flat_file.is_contig));
+
+ if (fd->hints->ds_write == PNCIO_HINT_DISABLE) {
+ /* If user has disabled data sieving on writes, use naive approach
+ * instead.
+ */
+ return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset);
+ }
+
+// printf("%s at %d: offset=%lld\n",__func__,__LINE__, offset);
+
+/* PnetCDF always sets these 3 conditions */
+assert(fd->filetype == MPI_BYTE);
+assert(fd->flat_file.size == buf_view.size);
+if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+ bufsize = buf_view.size;
+
+ /* get max_bufsize from the info object. */
+ max_bufsize = fd->hints->ind_wr_buffer_size;
+
+ if (!buf_view.is_contig && fd->flat_file.is_contig) {
+ /* noncontiguous in memory, contiguous in file.
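+ * Scattered pieces of the user buffer are copied into writebuf and
+ * flushed with one contiguous write each time the buffer fills.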
*/
+
+ off = fd->disp + offset;
+assert(fd->disp == 0);
+ if (fd->flat_file.count > 0) off += fd->flat_file.off[0];
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+ writebuf_off = off;
+ writebuf = (char *) NCI_Malloc(max_bufsize);
+ writebuf_len = MIN(max_bufsize, end_offset - writebuf_off + 1);
+
+ /* if atomicity is true or data sieving is not disabled, lock the region
+ * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ for (i = 0; i < buf_view.count; i++) {
+ userbuf_off = buf_view.off[i];
+ req_off = off;
+ req_len = buf_view.len[i];
+
+ /* BUFFERED_WRITE_WITHOUT_READ performs neither read-modify-write
+ * nor file locking
+ */
+ if (req_off >= writebuf_off + writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,
+ writebuf_off);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ writebuf_off = req_off;
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1);
+ }
+ write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);
+ memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off,
+ write_sz);
+ while (write_sz != req_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,
+ writebuf_off);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ req_len -= write_sz;
+ userbuf_off += write_sz;
+ writebuf_off += writebuf_len;
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1);
+ write_sz = MIN(req_len, writebuf_len);
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz);
+ }
+
+ off += buf_view.len[i];
+ }
+
+ /* write the buffer out finally */
+ if (writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+ if (w_len >= 0) total_w_len += w_len;
+ }
+ else
+ w_len = 0;
+
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (w_len < 0)
+ goto fn_exit;
+ }
+ else { /* noncontiguous in file */
+ MPI_Offset size_in_filetype = offset;
+
+ disp = fd->disp;
+assert(fd->disp == 0);
+
+ sum = 0;
+ for (i = 0; i < fd->flat_file.count; i++) {
+ sum += fd->flat_file.len[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = fd->flat_file.off[i] +
+ size_in_filetype - (sum - fd->flat_file.len[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + abs_off_in_filetype;
+
+ start_off = offset;
+assert(offset == abs_off_in_filetype);
+
+// printf("%s at %d: start_off=%lld abs_off_in_filetype=%lld\n",__func__,__LINE__,start_off,abs_off_in_filetype);
+
+ /* Write request is within single flat_file contig block. This could
+ * happen, for example, with subarray types that are actually fairly
+ * contiguous.
+ */
+ if (buf_view.is_contig && bufsize <= fwr_size) {
+ /* though MPI api has an integer 'count' parameter, derived
+ * datatypes might describe more bytes than can fit into an integer.
+ * if we've made it this far, we can pass a count of original
+ * datatypes, instead of a count of bytes (which might overflow)
+ * Other WriteContig calls in this path are operating on data
+ * sieving buffer */
+ PNCIO_WRITE_LOCK(fd, offset, SEEK_SET, bufsize);
+ w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset);
+ if (w_len > 0) total_w_len += w_len;
+ PNCIO_UNLOCK(fd, offset, SEEK_SET, bufsize);
+
+ goto fn_exit;
+ }
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
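+ * This is found by walking the flattened file view until bufsize
+ * bytes have been accounted for.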
+ * e.g., if start_offset=0 and 100 bytes to be written, end_offset=99 */
+
+ st_fwr_size = fwr_size;
+ j = st_index;
+ fwr_size = MIN(fwr_size, bufsize);
+ i_offset = fwr_size;
+ end_offset = offset + fwr_size - 1;
+ while (i_offset < bufsize) {
+ j++;
+ fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset);
+ i_offset += fwr_size;
+ end_offset = disp + fd->flat_file.off[j] + fwr_size - 1;
+ }
+
+ /* if atomicity is true or data sieving is not disabled, lock the region
+ * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ writebuf_off = 0;
+ writebuf_len = 0;
+ writebuf = (char *) NCI_Malloc(max_bufsize);
+ memset(writebuf, -1, max_bufsize);
+
+ if (buf_view.is_contig && !fd->flat_file.is_contig) {
+ /* contiguous in memory, noncontiguous in file should be the most
+ * common case.
+ */
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ fwr_size = MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ if (fwr_size) {
+ req_off = off;
+ req_len = fwr_size;
+ userbuf_off = i_offset;
+ BUFFERED_WRITE;
+ }
+
+ i_offset += fwr_size;
+ if (i_offset >= bufsize) break;
+
+ if (off + fwr_size < disp + fd->flat_file.off[j] +
+ fd->flat_file.len[j])
+ off += fwr_size; /* off is incremented by fwr_size. */
+ else {
+ j++;
+assert(j < fd->flat_file.count);
+ off = disp + fd->flat_file.off[j];
+ fwr_size = MIN(fd->flat_file.len[j],
+ bufsize - i_offset);
+ }
+ }
+ } else {
+ /* noncontiguous in memory as well as in file */
+ k = num = 0;
+ i_offset = buf_view.off[0];
+ j = st_index;
+ off = offset;
+ fwr_size = st_fwr_size;
+ bwr_size = buf_view.len[0];
+
+ while (num < bufsize) {
+ size = MIN(fwr_size, bwr_size);
+ if (size) {
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+ BUFFERED_WRITE;
+ }
+
+ num += size;
+ if (num >= bufsize) break;
+
+ new_fwr_size = fwr_size;
+ new_bwr_size = bwr_size;
+
+ if (size == fwr_size) {
+ j++;
+assert(j < fd->flat_file.count);
+ off = disp + fd->flat_file.off[j];
+ new_fwr_size = fd->flat_file.len[j];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+ /* reached end of contiguous block in memory */
+
+ k++;
+assert(k < buf_view.count);
+ i_offset = buf_view.off[k];
+ new_bwr_size = buf_view.len[k];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* write the buffer out finally */
+ if (writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ }
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+ }
+
+fn_exit:
+ if (writebuf != NULL)
+ NCI_Free(writebuf);
+
+ return total_w_len;
+}
diff --git a/src/drivers/pncio/pncio_write_str_naive.c b/src/drivers/pncio/pncio_write_str_naive.c
new file mode 100644
index 000000000..572ed855b
--- /dev/null
+++ b/src/drivers/pncio/pncio_write_str_naive.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
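+ *
+ * This file implements PNCIO_GEN_WriteStrided_naive(), the fallback
+ * used when data sieving is disabled: each contiguous piece of a
+ * strided request is written with its own call to PNCIO_WriteContig().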
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include
+
+MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd,
+ const void *buf,
+ PNCIO_View buf_view,
+ MPI_Offset offset)
+{
+ int b_index;
+ MPI_Count bufsize;
+
+ /* bwr == buffer write; fwr == file write */
+ MPI_Offset bwr_size, fwr_size = 0, sum, size_in_filetype, size;
+ MPI_Offset abs_off_in_filetype = 0, req_len, userbuf_off;
+ MPI_Offset off, req_off, disp, end_offset = 0, start_off;
+ MPI_Offset w_len, total_w_len=0;
+
+/* PnetCDF always sets fd->filetype == MPI_BYTE */
+assert(fd->filetype == MPI_BYTE);
+
+ /* Contiguous both in buftype and filetype should have been handled in a
+ * call to PNCIO_WriteContig() earlier.
+ */
+ assert(!(buf_view.is_contig && fd->flat_file.is_contig));
+
+ bufsize = buf_view.size;
+
+ if (!buf_view.is_contig && fd->flat_file.is_contig) {
+ /* noncontiguous in memory, contiguous in file. */
+
+ off = fd->disp + offset;
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+
+ /* if atomicity is true, lock (exclusive) the region to be accessed */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ /* for each region in the buffer, grab the data and put it in place */
+ for (b_index = 0; b_index < buf_view.count; b_index++) {
+ userbuf_off = buf_view.off[b_index];
+ req_off = off;
+ req_len = buf_view.len[b_index];
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+
+ /* off is (potentially) used to save the final offset later */
+ off += buf_view.len[b_index];
+ }
+
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+ }
+ else { /* noncontiguous in file */
+ int f_index, st_index = 0;
+ MPI_Offset st_fwr_size;
+
+ /* First we're going to calculate a set of values for use in all
+ * the noncontiguous in file cases:
+ * start_off - starting byte position of data in file
+ * end_offset - last byte offset to be accessed in the file
+ * st_index - index of block in first filetype that we will be
+ * starting in (?)
+ * st_fwr_size - size of the data in the first filetype block
+ * that we will write (accounts for being part-way
+ * into writing this block of the filetype)
+ */
+
+ disp = fd->disp;
+
+ size_in_filetype = offset;
+
+ sum = 0;
+ for (f_index = 0; f_index < fd->flat_file.count; f_index++) {
+ sum += fd->flat_file.len[f_index];
+ if (sum > size_in_filetype) {
+ st_index = f_index;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = fd->flat_file.off[f_index] +
+ size_in_filetype - (sum - fd->flat_file.len[f_index]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ start_off = disp + abs_off_in_filetype;
+
+ st_fwr_size = fwr_size;
+
+ /* start_off, st_index, and st_fwr_size are
+ * all calculated at this point
+ */
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
+ * e.g., if start_off=0 and 100 bytes to be written, end_offset=99
+ */
+ f_index = st_index;
+ fwr_size = MIN(st_fwr_size, bufsize);
+ userbuf_off = fwr_size;
+ end_offset = start_off + fwr_size - 1;
+ while (userbuf_off < bufsize) {
+ f_index++;
+ fwr_size = MIN(fd->flat_file.len[f_index],
+ bufsize - userbuf_off);
+ userbuf_off += fwr_size;
+ end_offset = disp + fd->flat_file.off[f_index] + fwr_size - 1;
+ }
+
+ /* End of calculations.
At this point the following values have
+ * been calculated and are ready for use:
+ * - start_off
+ * - end_offset
+ * - st_index
+ * - st_fwr_size
+ */
+
+ /* if atomicity is true, lock (exclusive) the region to be accessed */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ if (buf_view.is_contig && !fd->flat_file.is_contig) {
+ /* contiguous in memory, noncontiguous in file. should be the
+ * most common case.
+ */
+
+ userbuf_off = 0;
+ f_index = st_index;
+ off = start_off;
+ fwr_size = MIN(st_fwr_size, bufsize);
+
+ /* while there is still space in the buffer, write more data */
+ while (userbuf_off < bufsize) {
+ if (fwr_size) {
+ /* TYPE_UB and TYPE_LB can result in
+ * fwr_size = 0. save system call in such cases */
+ req_off = off;
+ req_len = fwr_size;
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+ }
+ userbuf_off += fwr_size;
+ if (userbuf_off >= bufsize) break;
+
+ if (off + fwr_size < disp + fd->flat_file.off[f_index] +
+ fd->flat_file.len[f_index]) {
+ /* important that this value be correct, as it is
+ * used to set the offset in the fd near the end of
+ * this function.
+ */
+ off += fwr_size;
+ }
+ /* did not reach end of contiguous block in filetype.
+ * no more I/O needed. off is incremented by fwr_size.
+ */
+ else {
+ f_index++;
+assert(f_index < fd->flat_file.count);
+ off = disp + fd->flat_file.off[f_index];
+ fwr_size = MIN(fd->flat_file.len[f_index],
+ bufsize - userbuf_off);
+ }
+ }
+ } else {
+ MPI_Offset i_offset, tmp_bufsize = 0;
+ /* noncontiguous in memory as well as in file */
+
+ b_index = 0;
+ i_offset = buf_view.off[0];
+ f_index = st_index;
+ off = start_off;
+ fwr_size = st_fwr_size;
+ bwr_size = buf_view.len[0];
+
+ /* while we haven't written size * count bytes, keep going */
+ while (tmp_bufsize < bufsize) {
+ MPI_Offset new_bwr_size = bwr_size, new_fwr_size = fwr_size;
+
+ size = MIN(fwr_size, bwr_size);
+ /* keep max of a single write amount <= INT_MAX */
+ size = MIN(size, INT_MAX);
+
+ if (size) {
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+ }
+
+ if (tmp_bufsize >= bufsize) break;
+ tmp_bufsize += size;
+
+ if (size == fwr_size) {
+ f_index++;
+assert(f_index < fd->flat_file.count);
+ off = disp + fd->flat_file.off[f_index];
+ new_fwr_size = fd->flat_file.len[f_index];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+ /* reached end of contiguous block in memory */
+ b_index++;
+assert(b_index < buf_view.count);
+ i_offset = buf_view.off[b_index];
+ new_bwr_size = buf_view.len[b_index];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* unlock the file region if we locked it */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ } /* end of (else noncontiguous in file) */
+
+ return total_w_len;
+}
diff --git a/src/include/pnc_debug.h b/src/include/pnc_debug.h
index 976244896..fcee81633 100644
--- a/src/include/pnc_debug.h
+++ b/src/include/pnc_debug.h
@@ -55,6 +55,27 @@ } \ return err; \ }
+#define DEBUG_FOPEN_ERROR(err) { \
+ if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \
+
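/* free the communicator created for intra-node aggregation so a failed open does not leak it */ \
+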
char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__); \ + } \ + return err; \ +} +#define DEBUG_RETURN_ERROR_MSG(err, msg) { \ + char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s (%s)\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__, msg); \ + } \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) { \ char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ if (_env_str != NULL && *_env_str != '0') { \ @@ -76,6 +97,11 @@ } #else #define DEBUG_RETURN_ERROR(err) return err; +#define DEBUG_RETURN_ERROR_MSG(err, msg) return err; +#define DEBUG_FOPEN_ERROR(err) { \ + if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) status = err; #define DEBUG_TRACE_ERROR(err) #endif diff --git a/src/include/pnetcdf.h.in b/src/include/pnetcdf.h.in index b363276b6..df7e9f66a 100644 --- a/src/include/pnetcdf.h.in +++ b/src/include/pnetcdf.h.in @@ -16,6 +16,7 @@ #define PNETCDF_VERSION_MAJOR @PNETCDF_VERSION_MAJOR@ #define PNETCDF_VERSION_MINOR @PNETCDF_VERSION_MINOR@ #define PNETCDF_VERSION_SUB @PNETCDF_VERSION_SUB@ +#define PNETCDF_VERSION_PRE "@PNETCDF_VERSION_PRE@" #define PNETCDF_RELEASE_DATE "DIST_DATE" /* List of PnetCDF features enabled/disabled at configure time. @@ -657,6 +658,7 @@ by the desired type. */ #define NC_EBADLOG (-238) /**< Unrecognized log file format */ #define NC_EFLUSHED (-239) /**< Nonblocking request has already been flushed. 
It is too late to cancel */ #define NC_EADIOS (-240) /**< unknown ADIOS error */ +#define NC_EFSTYPE (-241) /**< Invalid file system type */ /* add new error here */ /* header inconsistency errors start from -250 */ diff --git a/src/libs/Makefile.am b/src/libs/Makefile.am index a932f20f5..17ea3ed75 100644 --- a/src/libs/Makefile.am +++ b/src/libs/Makefile.am @@ -23,6 +23,7 @@ libpnetcdf_la_SOURCES = libpnetcdf_la_LIBADD += ../dispatchers/libdispatchers.la libpnetcdf_la_LIBADD += ../drivers/common/libcommon.la libpnetcdf_la_LIBADD += ../drivers/ncmpio/libncmpio.la +libpnetcdf_la_LIBADD += ../drivers/pncio/libpncio.la if BUILD_DRIVER_FOO libpnetcdf_la_LIBADD += ../drivers/ncfoo/libncfoo.la endif @@ -71,6 +72,9 @@ endif ../drivers/ncmpio/libncmpio.la: set -e; cd ../drivers/ncmpio && $(MAKE) $(MFLAGS) +../drivers/pncio/libpncio.la: + set -e; cd ../drivers/pncio && $(MAKE) $(MFLAGS) + ../drivers/ncncio/libncncio.la: set -e; cd ../drivers/ncncio && $(MAKE) $(MFLAGS) diff --git a/src/utils/ncmpidiff/cdfdiff.c b/src/utils/ncmpidiff/cdfdiff.c index be19bc0c7..73426fc7c 100644 --- a/src/utils/ncmpidiff/cdfdiff.c +++ b/src/utils/ncmpidiff/cdfdiff.c @@ -187,9 +187,9 @@ struct vspec { /*----< get_var_names() >-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, struct vspec* vspecp) { - char *cp=optarg, **cpp; + char *cp=opt_arg, **cpp; int nvars = 1; /* compute number of variable names in comma-delimited list */ @@ -203,7 +203,7 @@ get_var_names(char *optarg, struct vspec* vspecp) cpp = vspecp->names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -237,11 +237,11 @@ get_type(int type) /*----< main() >--------------------------------------------------------------*/ int main(int argc, char **argv) { + /* int verbose; is defined as a locally global variable in ncvalidator.c */ extern char *optarg; extern int optind; - char *str, *ptr; size_t nbytes; - int i, j, k, m, n, c, err, verbose, quiet, isDiff; + int i, j, k, m, n, c, err, quiet, isDiff; int fd[2], nvars[2], ndims[2], nattrs[2], check_tolerance; int cmp_nvars, check_header, check_variable_list, check_entire_file; long long numVarDIFF=0, numHeadDIFF=0, numDIFF; @@ -264,7 +264,8 @@ int main(int argc, char **argv) var_list.nvars = 0; check_tolerance = 0; - while ((c = getopt(argc, argv, "bhqv:t:")) != -1) + while ((c = getopt(argc, argv, "bhqv:t:")) != -1) { + char *str, *ptr; switch(c) { case 'h': /* compare header only */ check_header = 1; @@ -301,6 +302,7 @@ int main(int argc, char **argv) usage(argv[0]); break; } + } /* quiet mode overwrites verbose */ if (quiet) verbose = 0; diff --git a/src/utils/ncmpidiff/ncmpidiff.c b/src/utils/ncmpidiff/ncmpidiff.c index 8f1fe4f20..6eed1bb98 100644 --- a/src/utils/ncmpidiff/ncmpidiff.c +++ b/src/utils/ncmpidiff/ncmpidiff.c @@ -60,6 +60,15 @@ } \ } +#define HANDLE_FILE_ERR(filename) { \ + if (err != NC_NOERR) { \ + fprintf(stderr, "Error at line %d: input file %s (%s)\n", __LINE__, \ + filename, ncmpi_strerror(err)); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + exit(-1); \ + } \ +} + #define CHECK_GLOBAL_ATT_DIFF_CHAR { \ int pos; \ char *b1 = (char *)calloc((attlen[0] + 1) * 2, sizeof(char)); \ @@ -300,9 +309,9 @@ struct vspec { /*----< get_var_names() >-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, struct vspec* 
vspecp) { - char *cp=optarg, **cpp; + char *cp=opt_arg, **cpp; int nvars = 1; /* compute number of variable names in comma-delimited list */ @@ -316,7 +325,7 @@ get_var_names(char *optarg, struct vspec* vspecp) cpp = vspecp->names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -450,7 +459,7 @@ int main(int argc, char **argv) /* file format version */ err = ncmpi_inq_file_format(argv[optind+i], &fmt[i]); - HANDLE_ERROR + HANDLE_FILE_ERR(argv[optind+i]) if (fmt[i] == NC_FORMAT_NETCDF4 || fmt[i] == NC_FORMAT_NETCDF4_CLASSIC) { #ifndef ENABLE_NETCDF4 diff --git a/src/utils/ncmpidump/ncmpidump.c b/src/utils/ncmpidump/ncmpidump.c index be3d72482..77829ef18 100644 --- a/src/utils/ncmpidump/ncmpidump.c +++ b/src/utils/ncmpidump/ncmpidump.c @@ -51,9 +51,9 @@ static void pr_att_string(size_t len, const char* string); static void pr_att_vals(nc_type type, size_t len, const double* vals); static void pr_att(int ncid, int varid, const char *varname, int ia); static void do_ncdump(const char* path, struct fspec* specp); -static void make_lvars(char* optarg, struct fspec* fspecp); -static void set_sigdigs( const char* optarg); -static void set_precision( const char *optarg); +static void make_lvars(char* opt_arg, struct fspec* fspecp); +static void set_sigdigs( const char* opt_arg); +static void set_precision( const char *opt_arg); int main(int argc, char** argv); #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) @@ -611,9 +611,9 @@ do_ncdump(const char *path, struct fspec* specp) static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -628,7 +628,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -647,15 +647,15 @@ make_lvars(char *optarg, struct fspec* fspecp) * command-line and update the default data formats appropriately. */ static void -set_sigdigs(const char *optarg) +set_sigdigs(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') + flt_digits = (int)strtol(opt_arg, &ptr1, 10); if (flt_digits < 1 || flt_digits > 20) error("unreasonable value for float significant digits: %d", @@ -679,15 +679,15 @@ set_sigdigs(const char *optarg) * and update the default data formats appropriately. 
*/ static void -set_precision(const char *optarg) +set_precision(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') { - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') { + flt_digits = (int)strtol(opt_arg, &ptr1, 10); float_precision_specified = 1; } diff --git a/src/utils/ncmpidump/vardata.c b/src/utils/ncmpidump/vardata.c index 11310a9f9..b8cec8b99 100644 --- a/src/utils/ncmpidump/vardata.c +++ b/src/utils/ncmpidump/vardata.c @@ -82,9 +82,9 @@ static double double_eps; static float float_epsilon(void) { - float float_eps; + float val_float_eps; #ifndef NO_FLOAT_H - float_eps = FLT_EPSILON; + val_float_eps = FLT_EPSILON; #else /* NO_FLOAT_H */ { float etop, ebot, eps; @@ -103,19 +103,19 @@ float_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - float_eps = two * etop; + val_float_eps = two * etop; } #endif /* NO_FLOAT_H */ - return float_eps; + return val_float_eps; } static double double_epsilon(void) { - double double_eps; + double val_double_eps; #ifndef NO_FLOAT_H - double_eps = DBL_EPSILON; + val_double_eps = DBL_EPSILON; #else /* NO_FLOAT_H */ { double etop, ebot, eps; @@ -134,10 +134,10 @@ double_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - double_eps = two * etop; + val_double_eps = two * etop; } #endif /* NO_FLOAT_H */ - return double_eps; + return val_double_eps; } diff --git a/src/utils/ncmpigen/genlib.c b/src/utils/ncmpigen/genlib.c index 1e558a92f..26ac82d89 100644 --- a/src/utils/ncmpigen/genlib.c +++ b/src/utils/ncmpigen/genlib.c @@ -1548,7 +1548,6 @@ cl_fortran(void) } fline(stmnt); if (v->type != NC_CHAR) { - char *sp; sprintf(stmnt, "%s %s(", ncftype(v->type), v->lname); /* reverse dimensions for FORTRAN */ @@ -1582,12 +1581,12 @@ cl_fortran(void) if (v->has_data) { fline(v->data_stmnt); } else { /* generate data statement for FILL record */ - MPI_Offset rec_len = 1; + MPI_Offset rec_size = 1; for (idim = 1; idim < v->ndims; idim++) { - rec_len *= dims[v->dims[idim]].size; + rec_size *= dims[v->dims[idim]].size; } sprintf(stmnt,"data %s /%lu * %s/", v->lname, - (unsigned long) rec_len, + (unsigned long) rec_size, f_fill_name(v->type)); fline(stmnt); } @@ -1695,9 +1694,9 @@ close_netcdf(void) void -check_err(int stat, const char *ncmpi_func, const char *calling_func, int lineno, const char *calling_file) { +check_err(int stat, const char *ncmpi_func, const char *calling_func, int linenum, const char *calling_file) { if (stat != NC_NOERR) { - fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, lineno, calling_file, ncmpi_strerror(stat)); + fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, linenum, calling_file, ncmpi_strerror(stat)); derror_count++; } } diff --git a/src/utils/ncmpigen/load.c b/src/utils/ncmpigen/load.c index 69fe54e3a..788450aa2 100644 --- a/src/utils/ncmpigen/load.c +++ b/src/utils/ncmpigen/load.c @@ -394,7 +394,7 @@ fstrcat( */ static void f_var_init( - int varnum, /* which variable */ + int varid, /* which variable */ void *rec_start /* start of data */ ) { @@ -415,9 +415,9 @@ f_var_init( int ival; /* load variable with data values */ - sprintf(stmnt, "data %s /",vars[varnum].lname); + sprintf(stmnt, "data %s /",vars[varid].lname); stmnt_len = 
strlen(stmnt); - switch (vars[varnum].type) { + switch (vars[varid].type) { case NC_BYTE: charvalp = (char *) rec_start; for (ival = 0; ival < var_len-1; ival++) { @@ -524,10 +524,10 @@ f_var_init( /* For record variables, store data statement for later use; otherwise, just print it. */ - if (vars[varnum].ndims > 0 && vars[varnum].dims[0] == rec_dim) { + if (vars[varid].ndims > 0 && vars[varid].dims[0] == rec_dim) { char *dup_stmnt = (char*) emalloc(strlen(stmnt)+1); strcpy(dup_stmnt, stmnt); /* ULTRIX missing strdup */ - vars[varnum].data_stmnt = dup_stmnt; + vars[varid].data_stmnt = dup_stmnt; } else { fline(stmnt); } diff --git a/src/utils/ncmpigen/ncmpigentab.c b/src/utils/ncmpigen/ncmpigentab.c index 117e7d494..069986788 100644 --- a/src/utils/ncmpigen/ncmpigentab.c +++ b/src/utils/ncmpigen/ncmpigentab.c @@ -1,6 +1,8 @@ +/* #ifndef lint static const char yysccsid[] = "@(#)yaccpar 1.9 (Berkeley) 02/21/93"; #endif +*/ #include #include @@ -617,7 +619,6 @@ static int yygrowstack(void) #define YYABORT goto yyabort #define YYREJECT goto yyabort #define YYACCEPT goto yyaccept -#define YYERROR goto yyerrlab int yyparse(void) { @@ -686,11 +687,6 @@ yyparse(void) yyerror("syntax error"); -#ifdef lint - goto yyerrlab; -#endif - -yyerrlab: ++yynerrs; yyinrecovery: diff --git a/src/utils/ncoffsets/ncoffsets.c b/src/utils/ncoffsets/ncoffsets.c index 977f199dd..1e2a32972 100644 --- a/src/utils/ncoffsets/ncoffsets.c +++ b/src/utils/ncoffsets/ncoffsets.c @@ -1802,9 +1802,9 @@ struct fspec { }; static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -1819,7 +1819,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -2070,7 +2070,7 @@ int main(int argc, char *argv[]) /* print fixed-size variables first */ if (num_fix_vars) printf("\nfixed-size variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long size; @@ -2162,7 +2162,7 @@ int main(int argc, char *argv[]) /* print record variables */ if (num_rec_vars) printf("\nrecord variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long var_begin, var_end, size, numrecs; diff --git a/src/utils/ncvalidator/ncvalidator.c b/src/utils/ncvalidator/ncvalidator.c index da58bcf6c..078d2abf5 100644 --- a/src/utils/ncvalidator/ncvalidator.c +++ b/src/utils/ncvalidator/ncvalidator.c @@ -1397,7 +1397,7 @@ val_get_NC_attr(int fd, NC_attr **attrpp, const char *loc) { - char *name=NULL, xloc[1024]; + char *name=NULL, xloc[2048]; int err, status=NC_NOERR; size_t err_addr, name_len; nc_type xtype; @@ -2401,7 +2401,7 @@ val_get_NC(int fd, NC *ncp) /* check zero padding in the blank space betwee header size and extent */ if (repair && ncp->begin_var - ncp->xsz > 0) { - size_t i, gap = ncp->begin_var - ncp->xsz; + size_t gap = ncp->begin_var - ncp->xsz; ssize_t readLen; char *buf = (char*) malloc(gap); @@ -2448,7 +2448,7 @@ val_get_NC(int fd, NC *ncp) #ifndef BUILD_CDFDIFF -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", 
"testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if diff --git a/test/C/Makefile.am b/test/C/Makefile.am index 4d0668b6f..9336c3aec 100644 --- a/test/C/Makefile.am +++ b/test/C/Makefile.am @@ -68,7 +68,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/C/parallel_run.sh b/test/C/parallel_run.sh index 9fe1f41a8..76d9b9acb 100755 --- a/test/C/parallel_run.sh +++ b/test/C/parallel_run.sh @@ -17,7 +17,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,17 +27,46 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc ${MPIRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" @@ -67,6 +96,7 @@ for intra_aggr in 0 1 ; do fi done done +done rm -f ${OUTDIR}/*.nc rm -f ${OUTDIR}/*.nc4 diff --git a/test/C/pres_temp_4D_rd.c b/test/C/pres_temp_4D_rd.c index 84257c3b0..76078d798 100644 --- a/test/C/pres_temp_4D_rd.c +++ b/test/C/pres_temp_4D_rd.c @@ -191,9 +191,12 @@ int main(int argc, char **argv) } i++; } - } /* next record */ fn_exit: + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (nerrs > 0) break; + } /* next record */ + /* Close the file. 
*/ err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/C/seq_runs.sh b/test/C/seq_runs.sh index 1098e2896..9bab67c2c 100755 --- a/test/C/seq_runs.sh +++ b/test/C/seq_runs.sh @@ -23,8 +23,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc ${TESTSEQRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" @@ -54,5 +76,6 @@ for j in ${safe_modes} ; do fi # echo "" done +done rm -f ${OUTDIR}/*.nc rm -f ${OUTDIR}/*.nc4 diff --git a/test/CXX/Makefile.am b/test/CXX/Makefile.am index 532133e05..374be8757 100644 --- a/test/CXX/Makefile.am +++ b/test/CXX/Makefile.am @@ -60,7 +60,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/CXX/nctst.cpp b/test/CXX/nctst.cpp index 84cd1a4ad..41aca948b 100644 --- a/test/CXX/nctst.cpp +++ b/test/CXX/nctst.cpp @@ -568,7 +568,7 @@ main(int argc, char* argv[]) // test new netCDF interface if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } diff --git a/test/CXX/parallel_run.sh b/test/CXX/parallel_run.sh index 4f887ad27..a7efc1485 100755 --- a/test/CXX/parallel_run.sh +++ b/test/CXX/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "TESTPROGRAMS=${TESTPROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,46 @@ unset PNETCDF_HINTS for i in ${TESTPROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test 
"x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc # echo "--- validating file ${TESTOUTDIR}/$i.nc" @@ -67,6 +96,7 @@ for i in ${TESTPROGRAMS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/CXX/test_classic.cpp b/test/CXX/test_classic.cpp index c600181af..a38182de6 100644 --- a/test/CXX/test_classic.cpp +++ b/test/CXX/test_classic.cpp @@ -88,7 +88,7 @@ int main( int argc, char *argv[] ) if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } diff --git a/test/CXX/wrap_runs.sh b/test/CXX/wrap_runs.sh index d34ca7b47..49bfef8d2 100755 --- a/test/CXX/wrap_runs.sh +++ b/test/CXX/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} ./$1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -45,6 +67,7 @@ for j in ${safe_modes} ; do ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/F90/Makefile.am b/test/F90/Makefile.am index 4d4812767..5249744f9 100644 --- a/test/F90/Makefile.am +++ b/test/F90/Makefile.am @@ -86,28 +86,28 @@ ptest ptest4: $(PARALLEL_PROGS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - 
$(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptest8: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 8 || exit 1 ptest10: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 10 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 10 || exit 1 + $(srcdir)/../parallel_run.sh 10 || exit 1 ptests: ptest2 ptest4 ptest8 ptest10 ptest6: diff --git a/test/F90/f90tst_parallel.f90 b/test/F90/f90tst_parallel.f90 index d6223a334..4af5f9705 100644 --- a/test/F90/f90tst_parallel.f90 +++ b/test/F90/f90tst_parallel.f90 @@ -100,6 +100,9 @@ program f90tst_parallel ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + call handle_err(nf90mpi_enddef(ncid)) ! Determine what part of the variable will be written for this diff --git a/test/F90/f90tst_parallel4.f90 b/test/F90/f90tst_parallel4.f90 index b545fa847..70c652bba 100644 --- a/test/F90/f90tst_parallel4.f90 +++ b/test/F90/f90tst_parallel4.f90 @@ -45,10 +45,12 @@ program f90tst call handle_err(nf90mpi_def_dim(fh, 'dim2', 4_MPI_OFFSET_KIND, dimid(2))) call handle_err(nf90mpi_def_dim(fh, 'dim3', 1_MPI_OFFSET_KIND, dimid(3))) - call handle_err(nf90mpi_def_var(fh, 'var1', NF90_DOUBLE, dimid, varid)) - call handle_err(nf90mpi_enddef(fh)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(fh, varid, 0, NF90_FILL_DOUBLE)) + + call handle_err(nf90mpi_enddef(fh)) do i=1,3 f(i) = my_rank*3+i diff --git a/test/F90/f90tst_vars.f90 b/test/F90/f90tst_vars.f90 index 7f43a6908..e114fddf7 100644 --- a/test/F90/f90tst_vars.f90 +++ b/test/F90/f90tst_vars.f90 @@ -39,6 +39,7 @@ program f90tst_vars call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -78,6 +79,9 @@ program f90tst_vars ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + ! With classic model netCDF-4 file, enddef must be called. call handle_err(nf90mpi_enddef(ncid)) diff --git a/test/F90/f90tst_vars2.f90 b/test/F90/f90tst_vars2.f90 index d2011970e..1defbf9bf 100644 --- a/test/F90/f90tst_vars2.f90 +++ b/test/F90/f90tst_vars2.f90 @@ -24,9 +24,8 @@ program f90tst_vars2 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFLATE_LEVEL = 4 integer, parameter :: EightByteInt = selected_int_kind(18) integer (kind = EightByteInt) :: TOE_SAN_VALUE = 2147483648_EightByteInt @@ -53,6 +52,7 @@ program f90tst_vars2 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 
0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -94,6 +94,8 @@ program f90tst_vars2 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) ! enter independent data mode diff --git a/test/F90/f90tst_vars3.f90 b/test/F90/f90tst_vars3.f90 index 88fadc569..7c712bd0f 100644 --- a/test/F90/f90tst_vars3.f90 +++ b/test/F90/f90tst_vars3.f90 @@ -24,9 +24,8 @@ program f90tst_vars3 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFAULT_CACHE_NELEMS = 10000, DEFAULT_CACHE_SIZE = 1000000 integer, parameter :: DEFAULT_CACHE_PREEMPTION = 22 integer, parameter :: DEFLATE_LEVEL = 4 @@ -54,6 +53,7 @@ program f90tst_vars3 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -95,6 +95,8 @@ program f90tst_vars3 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) call check(nf90mpi_begin_indep_data(ncid)) diff --git a/test/F90/f90tst_vars4.f90 b/test/F90/f90tst_vars4.f90 index 1104246e3..6f48a638f 100644 --- a/test/F90/f90tst_vars4.f90 +++ b/test/F90/f90tst_vars4.f90 @@ -22,10 +22,9 @@ program f90tst_vars4 ! We need these ids and other gunk for netcdf. integer :: ncid, varid, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: mode_flag integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: CACHE_SIZE = 1000000 integer :: xtype_in, natts_in, dimids_in(MAX_DIMS) character (len = NF90_MAX_NAME) :: name_in @@ -39,6 +38,7 @@ program f90tst_vars4 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -73,6 +73,8 @@ program f90tst_vars4 ! Define the variable. call handle_err(nf90mpi_def_var(ncid, 'data', NF90_INT, dimids, varid)) + call handle_err(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + ! enddef must be called. 
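The Fortran hunks above all add the same idiom: turn filling on while still in define mode, either per variable (nf90mpi_def_var_fill with no_fill=0) or file-wide (nf90mpi_set_fill with NF90_FILL), so that cells no rank ever writes still read back as well-defined fill values. A minimal C sketch of that idiom, using the C APIs these Fortran bindings wrap; the file, dimension, and variable names are illustrative:

    #include <mpi.h>
    #include <pnetcdf.h>

    /* Both fill calls must happen in define mode, before ncmpi_enddef(). */
    int create_with_fill(MPI_Comm comm, const char *path)
    {
        int err, ncid, dimid, varid, old_fillmode;

        err = ncmpi_create(comm, path, NC_CLOBBER, MPI_INFO_NULL, &ncid);
        if (err != NC_NOERR) return err;

        ncmpi_def_dim(ncid, "x", 4, &dimid);
        ncmpi_def_var(ncid, "data", NC_INT, 1, &dimid, &varid);

        /* per-variable: no_fill=0 enables filling; a NULL fill value
         * selects the default, NC_FILL_INT for an NC_INT variable */
        ncmpi_def_var_fill(ncid, varid, 0, NULL);

        /* file-wide alternative, saving the previous fill mode */
        ncmpi_set_fill(ncid, NC_FILL, &old_fillmode);

        ncmpi_enddef(ncid);
        return ncmpi_close(ncid);
    }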
call handle_err(nf90mpi_enddef(ncid)) diff --git a/test/F90/parallel_run.sh b/test/F90/parallel_run.sh index da29ea3bc..0e2be8992 100755 --- a/test/F90/parallel_run.sh +++ b/test/F90/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "PARALLEL_PROGS=${PARALLEL_PROGS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,47 @@ unset PNETCDF_HINTS for i in ${PARALLEL_PROGS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + + # echo "${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc" ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc # echo "--- validating file ${TESTOUTDIR}/$i.nc" @@ -69,6 +99,7 @@ for i in ${PARALLEL_PROGS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/F90/test_attr_int64.f90 b/test/F90/test_attr_int64.f90 index 19b19183c..d8a821413 100644 --- a/test/F90/test_attr_int64.f90 +++ b/test/F90/test_attr_int64.f90 @@ -38,6 +38,7 @@ program main call MPI_Comm_rank(MPI_COMM_WORLD, rank, err) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 
0) then filename = 'testfile.nc' err = get_args(cmd, filename) diff --git a/test/F90/test_fill.f90 b/test/F90/test_fill.f90 index a35f52467..c63b1f63e 100644 --- a/test/F90/test_fill.f90 +++ b/test/F90/test_fill.f90 @@ -29,14 +29,16 @@ integer function tst_fmt(filename, mode) implicit none character(LEN=256) filename - integer i, err, ierr, rank + integer i, err, ierr, rank, nprocs integer :: ncid, mode, cmode, dimid(1), varid integer(kind=MPI_OFFSET_KIND) :: start(1) integer(kind=MPI_OFFSET_KIND) :: count(1) + integer(kind=MPI_OFFSET_KIND) :: dim_len integer(kind=MPI_OFFSET_KIND), parameter :: len = 3 integer, parameter :: k = selected_int_kind(18) integer(kind=k) :: buf(len) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) tst_fmt = 0 @@ -51,7 +53,8 @@ integer function tst_fmt(filename, mode) call check(err, 'In nf90mpi_create: ') tst_fmt = tst_fmt + err - err = nf90mpi_def_dim(ncid, "dim", len, dimid(1)) + dim_len = len * nprocs + err = nf90mpi_def_dim(ncid, "dim", dim_len, dimid(1)) call check(err, 'In nf90mpi_def_dim: ') tst_fmt = tst_fmt + err @@ -74,7 +77,7 @@ integer function tst_fmt(filename, mode) tst_fmt = tst_fmt + err ! Write buf - start(1) = 1 + start(1) = len * rank + 1 count(1) = len err = nf90mpi_put_var_all(ncid, varid, buf, start, count) call check(err, 'In nf90mpi_put_var_all: ') @@ -97,6 +100,7 @@ program test call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then filename = 'testfile.nc' err = get_args(cmd, filename) diff --git a/test/F90/test_intent.f90 b/test/F90/test_intent.f90 index 70612ca86..b5d096b4a 100644 --- a/test/F90/test_intent.f90 +++ b/test/F90/test_intent.f90 @@ -63,6 +63,7 @@ program main call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then filename = 'testfile.nc' err = get_args(cmd, filename) @@ -109,34 +110,38 @@ program main call check(err, 'In nfmpi_put_att_int8: ') ! define a variable of an integer array of size 3 in the nc file - err = nfmpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) - call check(err, 'In nfmpi_def_dim: ') + err = nf90mpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) + call check(err, 'In nf90mpi_def_dim: ') - err = nfmpi_def_var(ncid, 'var', NF90_INT, 1, dimid, varid) - call check(err, 'In nfmpi_def_var: ') + err = nf90mpi_def_var(ncid, 'var', NF90_INT, dimid, varid) + call check(err, 'In nf90mpi_def_var: ') - err = nfmpi_enddef(ncid) - call check(err, 'In nfmpi_enddef: ') + ! fill with default fill value + err = nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT) + call check(err, 'In nf90mpi_def_var_fill: ') + + err = nf90mpi_enddef(ncid) + call check(err, 'In nf90mpi_enddef: ') ! bufsize must be max of data type converted before and after bufsize = 3*4 - err = nfmpi_buffer_attach(ncid, bufsize) - call check(err, 'In nfmpi_buffer_attach: ') + err = nf90mpi_buffer_attach(ncid, bufsize) + call check(err, 'In nf90mpi_buffer_attach: ') start(1) = 1 count(1) = 3 - err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf, req(1)) + err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf(1:), req(1)) call check(err, 'In nfmpi_bput_vara_int: ') - err = nfmpi_wait_all(ncid, 1, req, status) - call check(err, 'In nfmpi_wait_all: ') + err = nf90mpi_wait_all(ncid, 1, req, status) + call check(err, 'In nf90mpi_wait_all: ') if (status(1) .ne. 
NF90_NOERR) then - print*,'Error at bput status ', nfmpi_strerror(status(1)) + print*,'Error at bput status ', nf90mpi_strerror(status(1)) endif - err = nfmpi_buffer_detach(ncid) - call check(err, 'In nfmpi_buffer_detach: ') + err = nf90mpi_buffer_detach(ncid) + call check(err, 'In nf90mpi_buffer_detach: ') ! close the file err = nf90mpi_close(ncid) diff --git a/test/F90/tst_f90.f90 b/test/F90/tst_f90.f90 index 3369556c9..43f789938 100644 --- a/test/F90/tst_f90.f90 +++ b/test/F90/tst_f90.f90 @@ -82,6 +82,7 @@ program netcdfTest call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/tst_f90_cdf5.f90 b/test/F90/tst_f90_cdf5.f90 index eae87baee..3432524cb 100644 --- a/test/F90/tst_f90_cdf5.f90 +++ b/test/F90/tst_f90_cdf5.f90 @@ -22,6 +22,7 @@ program tst_f90_nc4 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/tst_types2.f90 b/test/F90/tst_types2.f90 index f1506eb76..a9f8e7b6b 100644 --- a/test/F90/tst_types2.f90 +++ b/test/F90/tst_types2.f90 @@ -43,6 +43,7 @@ program tst_types2 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/wrap_runs.sh b/test/F90/wrap_runs.sh index 716aacf06..fcfb29fe9 100755 --- a/test/F90/wrap_runs.sh +++ b/test/F90/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -49,6 +71,7 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/Makefile.am b/test/Makefile.am index f5eb9d5b4..7d9407403 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -64,6 +64,8 @@ else PTEST_SUBDIRS = $(SUBDIRS) endif +EXTRA_DIST = parallel_run.sh + ptest: @for d in $(PTEST_SUBDIRS) ; do \ $(MAKE) $(MFLAGS) -C $$d ptest $$* || exit 1 ; \ diff --git a/test/adios/parallel_run.sh b/test/adios/parallel_run.sh index 612fd7591..a6602f77c 100755 --- a/test/adios/parallel_run.sh +++ 
b/test/adios/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,16 +26,46 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" if test "$i" = open ; then ${MPIRUN} ./$i ${srcdir}/arrays.bp diff --git a/test/adios/wrap_runs.sh b/test/adios/wrap_runs.sh index e619098d0..d7647dd15 100755 --- a/test/adios/wrap_runs.sh +++ b/test/adios/wrap_runs.sh @@ -22,8 +22,19 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + if test "$1" = ./open ; then ${TESTSEQRUN} $1 ${srcdir}/arrays.bp ${TESTSEQRUN} $1 ${srcdir}/attributes.bp diff --git a/test/burst_buffer/parallel_run.sh b/test/burst_buffer/parallel_run.sh index d726ee760..73b9d39a1 100755 --- a/test/burst_buffer/parallel_run.sh +++ b/test/burst_buffer/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "TESTPROGRAMS=${TESTPROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,45 @@ unset PNETCDF_HINTS for i in ${TESTPROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + 
INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" @@ -59,6 +87,7 @@ for i in ${TESTPROGRAMS} ; do ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.nc*.data rm -f ${OUTDIR}/$i.nc*.meta diff --git a/test/burst_buffer/varn.c b/test/burst_buffer/varn.c index 81e93063e..3913f3700 100644 --- a/test/burst_buffer/varn.c +++ b/test/burst_buffer/varn.c @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { free(cmd_str); } - /* Initialize file info */ + /* Initialize file info */ MPI_Info_create(&info); MPI_Info_set(info, "nc_burst_buf", "enable"); @@ -83,21 +83,25 @@ int main(int argc, char *argv[]) { /* Standard varn */ err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } /* NULL counts */ err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } @@ -106,14 +110,17 @@ int main(int argc, char *argv[]) { Counts[i] = (MPI_Offset*)counts[i]; } err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } +err_out: /* Close the file */ err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/burst_buffer/wrap_runs.sh b/test/burst_buffer/wrap_runs.sh index 308ccfc19..e7cfeda85 100755 --- a/test/burst_buffer/wrap_runs.sh +++ b/test/burst_buffer/wrap_runs.sh @@ -24,13 +24,39 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + +for bb_mode in 1 ; do + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + 
PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "$bb_mode" = 1 ; then + PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc done +done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc_0_0.data diff --git a/test/cdf_format/Makefile.am b/test/cdf_format/Makefile.am index a8d18918f..d7883e94f 100644 --- a/test/cdf_format/Makefile.am +++ b/test/cdf_format/Makefile.am @@ -75,7 +75,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/cdf_format/parallel_run.sh b/test/cdf_format/parallel_run.sh index 9f95d0813..dc8677ebf 100755 --- a/test/cdf_format/parallel_run.sh +++ b/test/cdf_format/parallel_run.sh @@ -19,7 +19,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "srcdir = ${srcdir}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,46 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./test_inq_format ${srcdir} ${MPIRUN} ./cdf_type ${TESTOUTDIR}/cdf_type.nc ${MPIRUN} ./dim_cdf12 ${TESTOUTDIR}/dim_cdf12.nc @@ -70,6 +99,7 @@ for intra_aggr in 0 1 ; do fi done done +done rm -f ${OUTDIR}/dim_cdf12.nc rm -f ${OUTDIR}/cdf_type.nc diff --git a/test/cdf_format/wrap_runs.sh b/test/cdf_format/wrap_runs.sh index c749c7dbe..f0370fd40 100755 --- a/test/cdf_format/wrap_runs.sh +++ 
b/test/cdf_format/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc @@ -35,7 +57,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -48,5 +70,6 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/cdl/Makefile.am b/test/cdl/Makefile.am index 493d5ccd2..81677ad01 100644 --- a/test/cdl/Makefile.am +++ b/test/cdl/Makefile.am @@ -54,7 +54,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/common/testutils.h b/test/common/testutils.h index 0dadd1c35..ef3d2593f 100644 --- a/test/common/testutils.h +++ b/test/common/testutils.h @@ -16,6 +16,9 @@ #include #include +#define MODE_COLL 1 +#define MODE_INDEP 0 + #define CHECK_ERR { \ if (err != NC_NOERR) { \ nerrs++; \ @@ -24,6 +27,16 @@ } \ } +#define CHECK_ERR_ALL { \ + if (err != NC_NOERR) { \ + nerrs++; \ + printf("Error at line %d in %s: (%s)\n", \ + __LINE__,__FILE__,ncmpi_strerrno(err)); \ + } \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + #define CHECK_ERROUT { \ if (err != NC_NOERR) { \ nerrs++; \ @@ -38,7 +51,7 @@ nerrs++; \ printf("Error at line %d in %s: (%s)\n", \ __LINE__,__FILE__,ncmpi_strerrno(err)); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ + goto fn_exit; \ } \ } @@ -50,6 +63,21 @@ } \ } +#define CHECK_EXP_ERR_ALL(exp) { \ + if (err != exp) { \ + nerrs++; \ + printf("Error at line %d in %s: expecting %s but got %s\n", \ + __LINE__,__FILE__,ncmpi_strerrno(exp), ncmpi_strerrno(err)); \ + } \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + +#define CHECK_NERRS_ALL { \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + int inq_env_hint(char *hint_key, char **hint_value); #ifdef PNETCDF_DEBUG diff --git a/test/fandc/csnap.c b/test/fandc/csnap.c index 2443086e5..7f81806cc 100644 --- a/test/fandc/csnap.c +++ b/test/fandc/csnap.c @@ -64,7 +64,7 @@ int pe_coords[3]; /* Cartesian PE coords */ /*** function prototypes ***/ -void 
find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin); +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin); void write_file(char *filename, double *t); void read_file(char *filename, double *t); void get_fields(double *tt, double *smf); @@ -390,14 +390,14 @@ void read_file(char *filename, double *t) { } -void find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin) { +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin) { MPI_Offset xremain; - *locnx = nx / totpes; - xremain = nx - totpes*(*locnx); - if (mype < xremain) (*locnx)++; - *xbegin = mype*(nx/totpes) + xremain; - if (mype < xremain) *xbegin += mype - xremain; + *locnx = nx / nprocs; + xremain = nx - nprocs*(*locnx); + if (rank < xremain) (*locnx)++; + *xbegin = rank*(nx/nprocs) + xremain; + if (rank < xremain) *xbegin += rank - xremain; } diff --git a/test/header/Makefile.am b/test/header/Makefile.am index 8080f6885..2dddc7e44 100644 --- a/test/header/Makefile.am +++ b/test/header/Makefile.am @@ -60,14 +60,14 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptests: ptest2 ptest4 ptest6 ptest8 ptest10: diff --git a/test/header/parallel_run.sh b/test/header/parallel_run.sh index a4d0770fa..a339fd83f 100755 --- a/test/header/parallel_run.sh +++ b/test/header/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,16 +29,46 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" ${MPIRUN} 
./$i ${TESTOUTDIR}/$i.nc @@ -68,6 +98,7 @@ for i in ${check_PROGRAMS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/header/seq_runs.sh b/test/header/seq_runs.sh index 475c8da8a..f35af292e 100755 --- a/test/header/seq_runs.sh +++ b/test/header/seq_runs.sh @@ -24,8 +24,30 @@ unset PNETCDF_HINTS # header consistency tests are designed to run on more than one MPI process for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc @@ -42,3 +64,4 @@ for j in ${safe_modes} ; do ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc fi done +done diff --git a/test/largefile/Makefile.am b/test/largefile/Makefile.am index d74134b76..f5a014332 100644 --- a/test/largefile/Makefile.am +++ b/test/largefile/Makefile.am @@ -90,7 +90,7 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptests: ptest4 ptest2 ptest6 ptest8 ptest10: diff --git a/test/nc4/parallel_run.sh b/test/nc4/parallel_run.sh index 6e0dfa371..85dc275bd 100755 --- a/test/nc4/parallel_run.sh +++ b/test/nc4/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,20 +26,15 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode export PNETCDF_HINTS="romio_no_indep_rw=true" else export PNETCDF_HINTS= fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi export PNETCDF_SAFE_MODE=$j # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc done - done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.nc.cdf4 done diff --git a/test/nc4/wrap_runs.sh b/test/nc4/wrap_runs.sh index 10d76802b..885d31b70 100755 --- a/test/nc4/wrap_runs.sh +++ b/test/nc4/wrap_runs.sh @@ -23,10 +23,33 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export 
PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc.cdf4 diff --git a/test/nc_test/t_nc.c b/test/nc_test/t_nc.c index c15b8d52d..1f1fb0873 100644 --- a/test/nc_test/t_nc.c +++ b/test/nc_test/t_nc.c @@ -112,32 +112,32 @@ static MPI_Offset sizes[] = { NC_UNLIMITED, SIZE_1 , SIZE_2 }; static const char * const dim_names[] = { "record", "ixx", "iyy"}; static int -createtestdims(int cdfid, size_t num_dims, const MPI_Offset *sizes, const char * const dim_names[]) +createtestdims(int cdfid, size_t ndims, const MPI_Offset *dim_sizes, const char * const names[]) { int dimid, err; - while(num_dims-- != 0) + while(ndims-- != 0) { - err = ncmpi_def_dim(cdfid, *dim_names++, *sizes, &dimid); ERR - sizes++; + err = ncmpi_def_dim(cdfid, *names++, *dim_sizes, &dimid); ERR + dim_sizes++; } return 0; } static int -testdims(int cdfid, size_t num_dims, MPI_Offset *sizes, const char * const dim_names[]) +testdims(int cdfid, size_t ndims, MPI_Offset *dim_sizes, const char * const names[]) { int ii, err; MPI_Offset size; char cp[NC_MAX_NAME]; - for(ii=0; (size_t) ii < num_dims; ii++, sizes++) + for(ii=0; (size_t) ii < ndims; ii++, dim_sizes++) { err = ncmpi_inq_dim(cdfid, ii, cp, &size); ERR - if( size != *sizes) + if( size != *dim_sizes) (void) fprintf(stderr, "%d: %lu != %lu\n", - ii, (unsigned long)size, (unsigned long)*sizes); - if ( size != *sizes) return 1; - if ( strcmp(cp, *dim_names++) != 0) return 1; + ii, (unsigned long)size, (unsigned long)*dim_sizes); + if ( size != *dim_sizes) return 1; + if ( strcmp(cp, *names++) != 0) return 1; } return 0; } @@ -195,11 +195,11 @@ static struct tcdfvar { #define NUM_TESTVARS 6 static int -createtestvars(int id, const struct tcdfvar *testvars, size_t count) +createtestvars(int id, const struct tcdfvar *vars, size_t count) { int ii, err; int varid; - const struct tcdfvar *vp = testvars; + const struct tcdfvar *vp = vars; for(ii = 0; (size_t) ii < count; ii++, vp++ ) { diff --git a/test/nc_test/test_iput.m4 b/test/nc_test/test_iput.m4 index d9f0da58c..ac0e18804 100644 --- a/test/nc_test/test_iput.m4 +++ b/test/nc_test/test_iput.m4 @@ -76,12 +76,12 @@ define(`CheckRange3', #include "tests.h" static double -hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) +hash2nc(const nc_type xtype, int v_rank, MPI_Offset *index) { double min; double max; - switch (var_type) { + switch (xtype) { /* no type conversion will happen for NC_CHAR, use in-memory limits */ case NC_CHAR: min = CHAR_MIN; max = (double)CHAR_MAX; break; case NC_BYTE: min = X_BYTE_MIN; max = (double)X_BYTE_MAX; break; @@ -98,16 +98,16 @@ hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) return NC_EBADTYPE; } - return MAX(min, MIN(max, hash(var_type, var_rank, index))); + return MAX(min, MIN(max, hash(xtype, v_rank, index))); } static int -dbls2ncs(size_t nels, int var_type, double *inBuf, void *outBuf) +dbls2ncs(size_t nels, int xtype, double *inBuf, void *outBuf) { size_t i; char *p = (char*)outBuf; for (i=0; i0) { + c_wbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + c_rbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + for (i=1; i0) { - cbuffer[0] = (long long*) malloc(sizeof(long long) * bufsize); - for (i=1; i0) free(cbuffer[0]); - for (i=0; i @@ -298,9 +284,6 @@ distribution. 
All test programs are designed to run on 4 MPI processes. *** TESTING C test_erange for checking for NC_ERANGE ------ pass *** TESTING C test_fillvalue for _FillValue for NC_GLOBAL ------ pass *** TESTING C test_get_varn for get_varn ------ pass - *** TESTING C test_vard for vard put and get ------ pass - *** TESTING C test_vard_multiple for vard to 2 variables ------ pass - *** TESTING C test_vard_rec for vard put on record var ------ pass *** TESTING C test_varm for get/put varm ------ pass *** TESTING C tst_def_var_fill for def_var_fill ------ pass *** TESTING C tst_dimsizes for defining max dimension sizes ------ pass @@ -336,8 +319,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING C examples/C/time_var ------ pass *** TESTING C examples/C/transpose2D ------ pass *** TESTING C examples/C/transpose ------ pass - *** TESTING C examples/C/vard_int ------ pass - *** TESTING C examples/C/vard_mvars ------ pass *** TESTING F77 examples/F77/block_cyclic.exe77 ------ pass *** TESTING F77 examples/F77/bput_varn_int8.exe77 ------ pass *** TESTING F77 examples/F77/column_wise.exe77 ------ pass @@ -352,7 +333,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING F77 examples/F77/put_varn_real.exe77 ------ pass *** TESTING F77 examples/F77/time_var.exe77 ------ pass *** TESTING F77 examples/F77/transpose.exe77 ------ pass - *** TESTING F77 examples/F77/vard_int.exe77 ------ pass *** TESTING F90 examples/F90/block_cyclic.exe90 ------ pass *** TESTING F90 examples/F90/column_wise.exe90 ------ pass *** TESTING F90 examples/F90/fill_mode.exe90 ------ pass @@ -364,7 +344,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING F90 examples/F90/put_varn_int.exe90 ------ pass *** TESTING F90 examples/F90/put_varn_real.exe90 ------ pass *** TESTING F90 examples/F90/transpose.exe90 ------ pass - *** TESTING F90 examples/F90/vard_int.exe90 ------ pass Total number of tested programs: 105 diff --git a/test/test_installed/makefile b/test/test_installed/makefile index b643b4925..87b9a9a18 100644 --- a/test/test_installed/makefile +++ b/test/test_installed/makefile @@ -61,9 +61,6 @@ testcases_src = ../testcases/add_var.c \ ../testcases/test_erange.c \ ../testcases/test_fillvalue.c \ ../testcases/test_get_varn.c \ - ../testcases/test_vard.c \ - ../testcases/test_vard_multiple.c \ - ../testcases/test_vard_rec.c \ ../testcases/test_varm.c \ ../testcases/tst_def_var_fill.c \ ../testcases/tst_del_attr.c \ @@ -109,10 +106,7 @@ examples_C_src = ../../examples/C/block_cyclic.c \ ../../examples/C/put_varn_int.c \ ../../examples/C/time_var.c \ ../../examples/C/transpose2D.c \ - ../../examples/C/transpose.c \ - ../../examples/C/vard_bottom.c \ - ../../examples/C/vard_int.c \ - ../../examples/C/vard_mvars.c + ../../examples/C/transpose.c EXAMPLE_PROGS += $(examples_C_src:../../examples/C/%.c=%) %.o: ../../examples/C/%.c $(CC) $(CFLAGS) -c $< @@ -146,8 +140,7 @@ examples_F77_src = ../../examples/F77/block_cyclic.f \ ../../examples/F77/put_varn_int.f \ ../../examples/F77/put_varn_real.f \ ../../examples/F77/time_var.f \ - ../../examples/F77/transpose.f \ - ../../examples/F77/vard_int.f + ../../examples/F77/transpose.f EXAMPLE_PROGS += $(examples_F77_src:../../examples/F77/%.f=%.exe77) %.77o: ../../examples/F77/%.f @@ -163,8 +156,7 @@ examples_F90_src = ../../examples/F90/block_cyclic.f90 \ ../../examples/F90/put_var.f90 \ ../../examples/F90/put_varn_int.f90 \ ../../examples/F90/put_varn_real.f90 \ - 
../../examples/F90/transpose.f90 \ - ../../examples/F90/vard_int.f90 + ../../examples/F90/transpose.f90 EXAMPLE_PROGS += $(examples_F90_src:../../examples/F90/%.f90=%.exe90) all: env_check testutils.o utils.o $(TEST_PROGS) $(EXAMPLE_PROGS) batch.sh interactive.sh diff --git a/test/testcases/Makefile.am b/test/testcases/Makefile.am index 5c8a8a25c..41636d39f 100644 --- a/test/testcases/Makefile.am +++ b/test/testcases/Makefile.am @@ -98,7 +98,8 @@ TESTPROGRAMS = file_create_open \ test_get_varn \ tst_del_attr \ tst_redefine \ - tst_grow_header + tst_grow_header \ + tst_varn_var1 M4_SRCS = put_all_kinds.m4 \ erange_fill.m4 \ @@ -220,7 +221,7 @@ CLEANFILES = $(M4_SRCS:.m4=.c) core core.* *.gcda *.gcno *.gcov gmon.out \ $(NC_FILES) EXTRA_DIST = $(M4_SRCS) seq_runs.sh redef-good.ncdump \ - wrap_runs.sh parallel_run.sh + wrap_runs.sh # Some of these tests are designed to run on one process, # Run them on 4 processes to see if they can handle well @@ -232,21 +233,21 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptest6: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 6 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 6 || exit 1 + $(srcdir)/../parallel_run.sh 6 || exit 1 ptests: ptest2 ptest4 ptest6 ptest8 ptest10: diff --git a/test/testcases/add_var.c b/test/testcases/add_var.c index 002942014..27bcf8de0 100644 --- a/test/testcases/add_var.c +++ b/test/testcases/add_var.c @@ -58,6 +58,7 @@ tst_fmt(char *filename, int cmode) for (i=0; i<10; i++) { sprintf(var_name, "var_%d", i); err = ncmpi_def_var(ncid, var_name, NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR } err = ncmpi_enddef(ncid); CHECK_ERR @@ -70,7 +71,9 @@ tst_fmt(char *filename, int cmode) /* add 2 new variables */ err = ncmpi_def_var(ncid, "new_var1", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_def_var(ncid, "new_var2", NC_FLOAT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_inq_nvars(ncid, &nvars); CHECK_ERR diff --git a/test/testcases/alignment_test.c b/test/testcases/alignment_test.c index 9646bbe12..c04a2377c 100644 --- a/test/testcases/alignment_test.c +++ b/test/testcases/alignment_test.c @@ -33,8 +33,9 @@ #define NVARS 8 #define NX 5 -int main(int argc, char** argv) { - char filename[256]; +static int tst_mode(char *filename, + int mode) +{ int i, j, rank, nprocs, err, verbose=0, nerrs=0; int ncid, cmode, varid[NVARS], dimid[2], *buf; char str[32]; @@ -43,25 +44,9 @@ int main(int argc, char** argv) { MPI_Offset header_size[2], header_extent[2]; MPI_Info info=MPI_INFO_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s 
[filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "redef1.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for alignment ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* create a new file for writing ----------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_DATA; err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); @@ -90,6 +75,11 @@ int main(int argc, char** argv) { } err = ncmpi_enddef(ncid); CHECK_ERR + if (mode != MODE_COLL) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* write all variables */ buf = (int*) malloc(sizeof(int) * NX); for (i=0; i 2) { + if (!rank) printf("Usage: %s [filename]\n",argv[0]); + MPI_Finalize(); + return 1; + } + if (argc == 2) snprintf(filename, 256, "%s", argv[1]); + else strcpy(filename, "testfile.nc"); + + if (rank == 0) { + char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); + sprintf(cmd_str, "*** TESTING C %s for alignment ", basename(argv[0])); + printf("%-66s ------ ", cmd_str); fflush(stdout); + free(cmd_str); + } + + nerrs += tst_mode(filename, MODE_COLL); + if (nerrs > 0) goto err_out; + + nerrs += tst_mode(filename, MODE_INDEP); + if (nerrs > 0) goto err_out; + /* check if PnetCDF freed all internal malloc */ MPI_Offset malloc_size, sum_size; err = ncmpi_inq_malloc_size(&malloc_size); @@ -328,13 +393,13 @@ int main(int argc, char** argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } +err_out: MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); else printf(PASS_STR); } -err_out: MPI_Finalize(); return (nerrs > 0); } diff --git a/test/testcases/buftype_free.c b/test/testcases/buftype_free.c index 90ffc7645..0d8b2fe7b 100644 --- a/test/testcases/buftype_free.c +++ b/test/testcases/buftype_free.c @@ -22,16 +22,18 @@ #define NY 4 #define NX 4 +#define NVARS 4 +#define NGHOSTS 2 /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { char filename[256]; - int i, j, err, ncid, varid[4], dimids[2], req[4], st[4], nerrs=0; - int rank, nprocs, buf[4][(NY+4)*(NX+4)]; + int i, j, err, ncid, varid[NVARS], dimids[2], req[NVARS], st[NVARS], nerrs=0; + int rank, nprocs, *buf[NVARS]; int gsize[2], subsize[2], a_start[2], ghost; MPI_Offset start[2], count[2]; - MPI_Datatype buftype[4]; + MPI_Datatype buftype[NVARS]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -65,7 +67,14 @@ int main(int argc, char **argv) { err = ncmpi_enddef(ncid); CHECK_ERR /* initialize the contents of the array */ - for (i=0; i<4; i++) for (j=0; j<(NY+4)*(NX+4); j++) buf[i][j] = rank+10; + ghost = NGHOSTS; + gsize[1] = NX + 2 * ghost; + gsize[0] = NY + 2 * ghost; + + for (i=0; i 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -86,8 +87,9 @@ int test_collective_error(char *filename, int safe_mode, int cmode) } err = ncmpi_put_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* 
check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -103,12 +105,10 @@ int test_collective_error(char *filename, int safe_mode, int cmode) if (!(cmode & NC_NETCDF4)) { err = ncmpi_iput_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -125,25 +125,26 @@ int test_collective_error(char *filename, int safe_mode, int cmode) err = ncmpi_get_vara_all(ncid, varid, start, count, buf, count[0], MPI_DOUBLE); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) err = ncmpi_get_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) if (!(cmode & NC_NETCDF4)) { err = ncmpi_iget_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL } - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR_ALL +fn_exit: return nerrs; } diff --git a/test/testcases/error_precedence.m4 b/test/testcases/error_precedence.m4 index 6b26b0c38..4732a9d08 100644 --- a/test/testcases/error_precedence.m4 +++ b/test/testcases/error_precedence.m4 @@ -196,6 +196,9 @@ test_format_nc$1(char *filename) foreach(`itype',(text, TYPE_LIST),`_CAT(` err=API(def_var)(ncid,"var_'itype`",NC_TYPE(itype),2,dimids,&vid_',itype`); CHECK_ERR')') + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(def_var_fill)(ncid, vid_'itype`, 0, NULL); CHECK_ERR')') + /* For put attribute APIs, the error precedence is the following: * NC_EBADID, NC_EPERM, NC_ENOTVAR, NC_EBADNAME, NC_EBADTYPE, NC_ECHAR, * NC_EINVAL, NC_ENOTINDEFINE, NC_ERANGE diff --git a/test/testcases/mix_collectives.c b/test/testcases/mix_collectives.c index 583a29599..649624242 100644 --- a/test/testcases/mix_collectives.c +++ b/test/testcases/mix_collectives.c @@ -181,7 +181,7 @@ int main(int argc, char **argv) __LINE__,__FILE__,i,g_buf[i],check_buf[i]); nerrs++; free(check_buf); - goto err_out; + goto syn_err; } } } @@ -207,7 +207,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -222,7 +222,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,j, j+rank*100, *val); nerrs++; - goto err_out; + goto syn_err; } val++; } @@ -238,7 +238,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -251,11 +251,15 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, -1, buf[j][i]); nerrs++; - 
goto err_out; + goto syn_err; } } } +syn_err: + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (nerrs) goto err_out; + /* test when different processes call put APIs with different varid */ err = ncmpi_redef(ncid); CHECK_ERR err = ncmpi_def_var(ncid, "scalar0", NC_INT, 0, NULL, &varids[0]); CHECK_ERR diff --git a/test/testcases/ncmpi_vars_null_stride.c b/test/testcases/ncmpi_vars_null_stride.c index 0cfd81aac..40884d966 100644 --- a/test/testcases/ncmpi_vars_null_stride.c +++ b/test/testcases/ncmpi_vars_null_stride.c @@ -30,11 +30,13 @@ #define NY 4 #define NX 2 +static int verbose; + static int tst_fmt(char *filename, int cmode) { int err, nerrs=0, ncid, dimid[NDIMS], varid[5], ndims=NDIMS; - int i, j, k, nprocs, rank, req, *buf; + int i, j, k, nprocs, rank, req, *buf=NULL; MPI_Offset start[NDIMS] = {0}; MPI_Offset count[NDIMS] = {0}; MPI_Offset stride[NDIMS] = {0}; @@ -54,12 +56,13 @@ tst_fmt(char *filename, int cmode) err = ncmpi_def_var(ncid, "v4", NC_INT, ndims, dimid, &varid[4]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + buf = (int*) malloc(sizeof(int) * NY * NX); + for (i=0; i------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; - int rank, nprocs, buf[NY+1][NX]; +static +int tst_mode(const char *filename, + int mode, + MPI_Info info) +{ + int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; + int rank, nprocs, buf[NY+1][NX]; MPI_Offset start[2], count[2]; - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for using ncmpi_iput_vara_int() ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - MPI_Info_create(&info); - /* When using PVFS2, unexpected buffer value error message might occur. - * This is due to a possible bug in ADIOI_PVFS2_OldWriteStrided() when - * filetype is contiguous and buftype is non-contiguous. 
- * Fix: Add ROMIO hint to force ADIO driever to use POSIX I/O */ - /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); */ - - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR - MPI_Info_free(&info); + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); + CHECK_FATAL_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -88,6 +64,10 @@ int main(int argc, char **argv) { err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (mode == MODE_INDEP) { + err = ncmpi_sync(ncid); CHECK_ERR + } + /* initialize the contents of the array */ for (j=0; j------------------------------------------------------------*/ +int main(int argc, char **argv) { + + char filename[256]; + int err, nerrs=0, rank; + MPI_Info info; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (argc > 2) { + if (!rank) printf("Usage: %s [filename]\n",argv[0]); + MPI_Finalize(); + return 1; + } + if (argc == 2) snprintf(filename, 256, "%s", argv[1]); + else strcpy(filename, "testfile.nc"); + MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); + + if (rank == 0) { + char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); + sprintf(cmd_str, "*** TESTING C %s for using ncmpi_iput_vara_int() ", basename(argv[0])); + printf("%-66s ------ ", cmd_str); fflush(stdout); + free(cmd_str); + } + + MPI_Info_create(&info); + /* When using PVFS2, unexpected buffer value error message might occur. + * This is due to a possible bug in ADIOI_PVFS2_OldWriteStrided() when + * filetype is contiguous and buftype is non-contiguous. + * Fix: Add ROMIO hint to force MPI-IO to use POSIX I/O driver */ + /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); */ + + /* disable internal buffering for small non-blocking APIs */ + MPI_Info_set(info, "nc_ibuf_size", "0"); + + nerrs = tst_mode(filename, MODE_COLL, MPI_INFO_NULL); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_INDEP, MPI_INFO_NULL); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_COLL, info); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_INDEP, info); + if (nerrs > 0) goto err_out; + /* check if PnetCDF freed all internal malloc */ MPI_Offset malloc_size, sum_size; err = ncmpi_inq_malloc_size(&malloc_size); @@ -155,6 +212,9 @@ int main(int argc, char **argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } +err_out: + MPI_Info_free(&info); + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); diff --git a/test/testcases/null_args.m4 b/test/testcases/null_args.m4 index 65484f380..4fed908bc 100644 --- a/test/testcases/null_args.m4 +++ b/test/testcases/null_args.m4 @@ -295,11 +295,18 @@ test_format_nc$1(char *filename) /* define variables */dnl foreach(`itype',(text, TYPE_LIST),`_CAT(` err = ncmpi_def_var(ncid,"var_'itype`",NC_TYPE(itype),2,dimid,&vid_',itype`); - EXP_ERR_MSG(NC_NOERR,"def_var")')') + EXP_ERR_MSG(NC_NOERR,"def_var") + err = ncmpi_def_var_fill(ncid, vid_'itype`, 0, NULL); + EXP_ERR_MSG(NC_NOERR,"def_var_fill")')') err = ncmpi_enddef(ncid); EXP_ERR_MSG(NC_NOERR,"enddef") + /* fill the 1st record of all variables */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err = ncmpi_fill_var_rec(ncid, vid_'itype`, 0); + EXP_ERR_MSG(NC_NOERR,"fill_var_rec")')') + start[0] = start[1] = 0; count[0] = count[1] = 1; stride[0] = stride[1] = 1; diff --git 
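
The two m4 test generators above (error_precedence.m4 and null_args.m4) now exercise the fill APIs right after defining each variable. In PnetCDF, ncmpi_def_var_fill() with no_fill == 0 enables fill mode for one variable, a NULL fill_value selects the type's default, and ncmpi_fill_var_rec() fills one record of a record variable once the file is in data mode. A minimal sketch of that call sequence (the helper and variable names are illustrative):

    #include <pnetcdf.h>

    /* Sketch: define a variable with fill mode on, then fill record 0 */
    static int def_var_with_fill(int ncid, int dimids[2], int *varidp)
    {
        int err;

        err = ncmpi_def_var(ncid, "var_int", NC_INT, 2, dimids, varidp);
        if (err != NC_NOERR) return err;

        /* no_fill == 0 turns fill mode on; NULL uses the default fill value */
        err = ncmpi_def_var_fill(ncid, *varidp, 0, NULL);
        if (err != NC_NOERR) return err;

        err = ncmpi_enddef(ncid);
        if (err != NC_NOERR) return err;

        /* fill the 1st record (requires a record variable) */
        return ncmpi_fill_var_rec(ncid, *varidp, 0);
    }
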
a/test/testcases/parallel_run.sh b/test/testcases/parallel_run.sh index 0abc8b12f..4418f27e0 100755 --- a/test/testcases/parallel_run.sh +++ b/test/testcases/parallel_run.sh @@ -21,7 +21,7 @@ OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` NTHREADS=`expr $1 \* 6 - 1` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -32,17 +32,53 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + fi + + OUT_FILE=$INA_OUT_FILE + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" if test "$i" = tst_version ; then ${MPIRUN} ./tst_version @@ -51,25 +87,25 @@ for i in ${check_PROGRAMS} ; do if test "$i" = tst_pthread ; then # each MPI process created 6 threads - ${MPIRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc + ${MPIRUN} ./tst_pthread ${OUT_FILE}.nc for k in `seq 0 ${NTHREADS}` ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$k + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc.$k rm -f ${OUTDIR}/tst_pthread.nc.$k done continue fi - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc + ${MPIRUN} ./$i ${OUT_FILE}.nc # put_all_kinds and iput_all_kinds output 3 files if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc$k + # echo "--- validating file ${OUT_FILE}.nc$k" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc$k done else - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc + # echo "--- validating file ${OUT_FILE}.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc fi # echo "" @@ -77,21 +113,21 @@ for i in ${check_PROGRAMS} ; do # echo "---- test burst buffering feature" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc + ${MPIRUN} ./$i ${OUT_FILE}.bb.nc export PNETCDF_HINTS=${saved_PNETCDF_HINTS} # put_all_kinds and iput_all_kinds output 3 files if test "$i" = put_all_kinds -o "$i" = iput_all_kinds 
; then for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc$k + # echo "--- validating file ${OUT_FILE}.bb.nc$k" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc$k # echo "--- ncmpidiff $i.nc$k $i.bb.nc$k ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc$k ${TESTOUTDIR}/$i.bb.nc$k + ${MPIRUN} ${NCMPIDIFF} -q ${OUT_FILE}.nc$k ${OUT_FILE}.bb.nc$k done continue else - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + # echo "--- validating file ${OUT_FILE}.bb.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc fi # skip ncmpidiff for large file @@ -100,17 +136,57 @@ for i in ${check_PROGRAMS} ; do fi # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc + ${MPIRUN} ${NCMPIDIFF} -q ${OUT_FILE}.nc ${OUT_FILE}.bb.nc fi if test "x${ENABLE_NETCDF4}" = x1 ; then # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 + ${MPIRUN} ./$i ${OUT_FILE}.nc4 4 # Validator does not support nc4 fi - done - done - rm -f ${OUTDIR}/$i.nc* - rm -f ${OUTDIR}/$i.bb.nc* -done + done # intra_aggr + done # mpiio_mode + + if test "$i" = tst_version ; then + # this program creates no output file + continue + fi + if [[ "$i" == *"vard"* ]] ; then + continue + fi + + DIFF_OPT="-q" + if test "$i" = last_large_var ; then + DIFF_OPT+=" -h" + fi + if test "$i" = put_all_kinds || test "$i" = iput_all_kinds ; then + for j in 1 2 5; do + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.mpio.ina.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.mpio.ina.nc$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.nc$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.ina.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.ina.nc$j + done + elif test "$i" = tst_pthread ; then + for j in `seq 0 ${NTHREADS}` ; do + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j + done + else + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc + fi + + done # safe_modes + rm -f ${OUTDIR}/$i*nc* +done # check_PROGRAMS diff --git a/test/testcases/seq_runs.sh b/test/testcases/seq_runs.sh index 270536cc6..32a01bfcb 100755 --- a/test/testcases/seq_runs.sh +++ b/test/testcases/seq_runs.sh @@ -12,69 +12,96 @@ VALIDATOR=../../src/utils/ncvalidator/ncvalidator # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./tst_version - -${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q 
${TESTOUTDIR}/put_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc5 - -${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc5 - -NCMPIGEN=../../src/utils/ncmpigen/ncmpigen -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff - -# remove the file system type prefix name if there is any. -OUT_PATH=`echo "$TESTOUTDIR" | cut -d: -f2-` - -rm -f ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc -${TESTSEQRUN} ${NCMPIGEN} -v 5 -o ${TESTOUTDIR}/redef1.nc ${srcdir}/redef-good.ncdump -${TESTSEQRUN} ./redef1 ${TESTOUTDIR}/testfile.nc -${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.nc ${TESTOUTDIR}/redef1.nc -# diff -q ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc - -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.nc -rm -f ${OUT_PATH}/redef1.nc -rm -f ${OUT_PATH}/testfile.nc - -# echo "" - -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - - # Run using burst buffer driver - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.bb.nc - ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.bb.nc - unset PNETCDF_HINTS - - # Compare - for i in 1 2 5 ; do - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/put_all_kinds.nc$i ${TESTOUTDIR}/put_all_kinds.bb.nc$i - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/iput_all_kinds.nc$i ${TESTOUTDIR}/iput_all_kinds.bb.nc$i - done -fi -rm -f ${OUT_PATH}/put_all_kinds.nc* -rm -f ${OUT_PATH}/put_all_kinds.bb.nc* -rm -f ${OUT_PATH}/iput_all_kinds.nc* -rm -f ${OUT_PATH}/iput_all_kinds.bb.nc* - -# echo "" - -if test "${ENABLE_THREAD_SAFE}" = 1 ; then - # echo "---- testing thread safety" - for j in 0 1 ; do - export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - - ${TESTSEQRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc - for i in 0 1 2 3 4 5 ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$i - rm -f ${OUT_PATH}/tst_pthread.nc.$i +for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + + ${TESTSEQRUN} ./tst_version + + ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.nc + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc1 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc2 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc5 + + ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.nc + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc1 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc2 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc5 + + NCMPIGEN=../../src/utils/ncmpigen/ncmpigen + 
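
As in parallel_run.sh above, seq_runs.sh now wraps its whole body in loops over safe mode and the nc_pncio setting, composing PNETCDF_HINTS as a semicolon-separated list before each run. The same hints can also be set programmatically through an MPI_Info object at file-creation time; a minimal sketch (the helper name is illustrative; the hint keys are the ones the scripts use):

    #include <mpi.h>
    #include <pnetcdf.h>

    /* Sketch: programmatic equivalent of exporting
     *   PNETCDF_HINTS="nc_num_aggrs_per_node=2;nc_pncio=enable;romio_no_indep_rw=true" */
    static int create_with_hints(const char *filename, int *ncidp)
    {
        int err;
        MPI_Info info;

        MPI_Info_create(&info);
        MPI_Info_set(info, "romio_no_indep_rw", "true");  /* safe-mode runs */
        MPI_Info_set(info, "nc_pncio", "enable");         /* select the PNCIO driver */
        MPI_Info_set(info, "nc_num_aggrs_per_node", "2"); /* intra-node aggregation */

        err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, ncidp);
        MPI_Info_free(&info);
        return err;
    }
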
NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff + + # remove the file system type prefix name if there is any. + OUT_PATH=`echo "$TESTOUTDIR" | cut -d: -f2-` + + rm -f ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc + ${TESTSEQRUN} ${NCMPIGEN} -v 5 -o ${TESTOUTDIR}/redef1.nc ${srcdir}/redef-good.ncdump + ${TESTSEQRUN} ./redef1 ${TESTOUTDIR}/testfile.nc + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.nc ${TESTOUTDIR}/redef1.nc + # diff -q ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc + + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.nc + rm -f ${OUT_PATH}/redef1.nc + rm -f ${OUT_PATH}/testfile.nc + + # echo "" + + if test "x${ENABLE_BURST_BUFFER}" = x1 ; then + echo "" + echo "---- testing burst buffering" + + # Run using burst buffer driver + export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.bb.nc + ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.bb.nc + unset PNETCDF_HINTS + + # Compare + for i in 1 2 5 ; do + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/put_all_kinds.nc$i ${TESTOUTDIR}/put_all_kinds.bb.nc$i + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/iput_all_kinds.nc$i ${TESTOUTDIR}/iput_all_kinds.bb.nc$i done - done -fi + fi + rm -f ${OUT_PATH}/put_all_kinds.nc* + rm -f ${OUT_PATH}/put_all_kinds.bb.nc* + rm -f ${OUT_PATH}/iput_all_kinds.nc* + rm -f ${OUT_PATH}/iput_all_kinds.bb.nc* + + # echo "" + + if test "${ENABLE_THREAD_SAFE}" = 1 ; then + # echo "---- testing thread safety" + for j in 0 1 ; do + export PNETCDF_SAFE_MODE=$j + # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + + ${TESTSEQRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc + for i in 0 1 2 3 4 5 ; do + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$i + rm -f ${OUT_PATH}/tst_pthread.nc.$i + done + done + fi +done +done diff --git a/test/testcases/test_vard.c b/test/testcases/test_vard.c index 5ea0ded0f..3fbad06ad 100644 --- a/test/testcases/test_vard.c +++ b/test/testcases/test_vard.c @@ -64,6 +64,7 @@ if (buf[j][i] != val+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,val+i,buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } \ @@ -74,6 +75,7 @@ if (buf[j][i] != rank*100+j*10+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,rank*100+j*10+i,(int)buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } @@ -143,20 +145,23 @@ int get_var_and_verify(int ncid, nerrs++; } } + +fn_exit: free(ncbuf); + return nerrs; } /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, ncid, varid0, varid1, varid2, dimids[2], nerrs=0; int rank, nprocs, blocklengths[2], **buf, *bufptr; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; int buftype_size, expected_put_size, format; - float **flt_buf, *flt_bufptr; - double **dbl_buf, *dbl_bufptr; + float **flt_buf=NULL, *flt_bufptr; + double **dbl_buf=NULL, *dbl_bufptr; MPI_Offset start[2], count[2], header_size, put_size, new_put_size; MPI_Aint a0, a1, disps[2]; MPI_Datatype buftype, ghost_buftype, rec_filetype, fix_filetype; @@ -182,6 +187,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. 
+ */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + /* construct various MPI derived data types */ buf = (int**)malloc(sizeof(int*) * NY); @@ -486,6 +504,7 @@ int main(int argc, char **argv) { } free(schar_buf); +fn_exit: MPI_Type_free(&rec_filetype); MPI_Type_free(&fix_filetype); MPI_Type_free(&buftype); @@ -495,8 +514,14 @@ int main(int argc, char **argv) { free(array_of_blocklengths); free(array_of_displacements); free(buf[0]); free(buf); - free(flt_buf[0]); free(flt_buf); - free(dbl_buf[0]); free(dbl_buf); + if (flt_buf != NULL) { + free(flt_buf[0]); + free(flt_buf); + } + if (dbl_buf != NULL) { + free(dbl_buf[0]); + free(dbl_buf); + } err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/testcases/test_vard_multiple.c b/test/testcases/test_vard_multiple.c index 1e47dbeb4..5e1444942 100644 --- a/test/testcases/test_vard_multiple.c +++ b/test/testcases/test_vard_multiple.c @@ -77,7 +77,7 @@ /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, ncid, varid[4], dimids[3], nerrs=0, unlimit_dimid; int rank, nprocs, *buf[2]; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; @@ -106,6 +106,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. + */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + buf[0] = (int*)malloc(sizeof(int) * NY * NX); for (j=0; j------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, nerrs=0, ncid, varid, dimids[2], unlimit_dimid; int rank, nprocs, verbose, array_of_blocklengths[2], buf[NY][NX]; MPI_Offset recsize, len; @@ -72,6 +72,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. 
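
The vard tests above (and test_vard_unlim.c below) skip themselves whenever the nc_num_aggrs_per_node hint requests intra-node aggregation, since the vard APIs are not supported in that mode. inq_env_hint() is a helper from the suite's testutils; a rough standalone approximation of such a lookup, scanning the semicolon-separated PNETCDF_HINTS environment variable (parsing is simplified and the function name is illustrative):

    #include <stdlib.h>
    #include <string.h>

    /* Sketch: return 1 and copy out the value if "key=value" appears in
     * the PNETCDF_HINTS environment variable, e.g.
     *   PNETCDF_HINTS="nc_num_aggrs_per_node=2;nc_pncio=enable" */
    static int lookup_env_hint(const char *key, char *value, size_t len)
    {
        size_t n;
        char *hit, *env = getenv("PNETCDF_HINTS");

        if (env == NULL) return 0;
        hit = strstr(env, key);                    /* simplified matching */
        if (hit == NULL || hit[strlen(key)] != '=') return 0;

        hit += strlen(key) + 1;
        n = strcspn(hit, ";");                     /* value ends at ';' or NUL */
        if (n >= len) n = len - 1;
        memcpy(value, hit, n);
        value[n] = '\0';
        return 1;
    }

A test can then call lookup_env_hint("nc_num_aggrs_per_node", v, sizeof(v)) and skip when atoi(v) > 0, mirroring the checks in the hunks above.
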
+ */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + /* create a new file for write */ err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR diff --git a/test/testcases/test_varm.c b/test/testcases/test_varm.c index 0248ec400..cd84ca595 100644 --- a/test/testcases/test_varm.c +++ b/test/testcases/test_varm.c @@ -14,6 +14,8 @@ #include +static int verbose; + static int check_read_contents(float *rh) { @@ -25,10 +27,9 @@ check_read_contents(float *rh) for (i=0; i<6; i++) { for (j=0; j<4; j++) { if (rh[j*6+i] != k) { -#ifdef PRINT_ERR_ON_SCREEN - printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", - __FILE__,__LINE__,j,i,k,rh[j*6+i]); -#endif + if (verbose) + printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", + __FILE__,__LINE__,j,i,k,rh[j*6+i]); return 1; } k += 1.0; @@ -71,11 +72,10 @@ check_write_contents(signed char *varT) for (j=0; j<4; j++) { for (i=0; i<6; i++) { if (varT[j*6+i] != j*6+i + 50) { -#ifdef PRINT_ERR_ON_SCREEN - /* this error is a pnetcdf internal error, if occurs */ - printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", - __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); -#endif + if (verbose) + /* this error is a pnetcdf internal error, if occurs */ + printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", + __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); return 1; } } @@ -97,7 +97,8 @@ tst_fmt(char *filename, int cmode) MPI_Comm_size(MPI_COMM_WORLD, &nprocs); cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR /* define a variable of a 6 x 4 integer array in the nc file */ err = ncmpi_def_dim(ncid, "Y", 6, &dimid[0]); CHECK_ERR @@ -124,7 +125,8 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -177,7 +179,8 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -186,7 +189,8 @@ tst_fmt(char *filename, int cmode) start[0] = 0; start[1] = 0; count[0] = 6; count[1] = 4; if (rank > 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); CHECK_ERR + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); + CHECK_ERR /* set the contents of the write buffer varT, a 4 x 6 char array 50, 51, 52, 53, 54, 55, @@ -236,6 +240,7 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR +fn_exit: return nerrs; } @@ -264,6 +269,8 @@ int main(int argc, char **argv) free(cmd_str); } + verbose = 1; + #ifdef DEBUG if (nprocs > 1 && rank == 0) printf("Warning: %s is designed to run on 1 process\n", argv[0]); diff --git a/test/testcases/tst_varn_var1.c b/test/testcases/tst_varn_var1.c new file mode 100644 index 000000000..5217f7a3a --- /dev/null +++ b/test/testcases/tst_varn_var1.c @@ -0,0 +1,217 @@ 
+/*********************************************************************
+ *
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ *
+ *********************************************************************/
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * This example tests a single call of ncmpi_put_varn_int_all() to write a
+ * sequence of requests with arbitrary array indices, all with length == 1.
+ *
+ * The compile and run commands are given below, together with an ncmpidump of
+ * the output file.
+ *
+ * % mpicc -O2 -o tst_varn_var1 tst_varn_var1.c -lpnetcdf
+ * % mpiexec -n 4 ./tst_varn_var1 /pvfs2/wkliao/testfile.nc
+ * % ncmpidump /pvfs2/wkliao/testfile.nc
+ * netcdf testfile {
+ * // file format: CDF-5 (big variables)
+ * dimensions:
+ *         Y = 4 ;
+ *         X = 10 ;
+ *         time = UNLIMITED ; // (4 currently)
+ * variables:
+ *         int fix_var(Y, X) ;
+ *         int rec_var(time, X) ;
+ * data:
+ *
+ * fix_var =
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ;
+ *
+ * rec_var =
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ;
+ * }
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strcpy(), memset() */
+#include <libgen.h> /* basename() */
+#include <mpi.h>
+#include <pnetcdf.h>
+
+#include <testutils.h>
+
+#define NY 4
+#define NX 4
+#define NDIMS 2
+
+int main(int argc, char** argv)
+{
+    char filename[256];
+    int i, j, k, rank, nprocs, err, nerrs=0;
+    int ncid, cmode, varid[2], dimid[2], nreqs, req, *buf;
+    MPI_Offset **starts=NULL;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    if (argc > 2) {
+        if (!rank) printf("Usage: %s [filename]\n",argv[0]);
+        MPI_Finalize();
+        return 1;
+    }
+    if (argc == 2) snprintf(filename, 256, "%s", argv[1]);
+    else strcpy(filename, "testfile.nc");
+    MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD);
+
+    if (rank == 0) {
+        char *cmd_str = (char*)malloc(strlen(argv[0]) + 256);
+        sprintf(cmd_str, "*** TESTING C %s for ncmpi_put_varn_int_all() ", basename(argv[0]));
+        printf("%-66s ------ ", cmd_str); fflush(stdout);
+        free(cmd_str);
+    }
+
+    buf = (int*) malloc(sizeof(int) * NY * NX);
+
+    nreqs = NY * NX * nprocs;
+    starts = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * nreqs);
+    starts[0] = (MPI_Offset*) calloc(nreqs * NDIMS, sizeof(MPI_Offset));
+    for (i=1; i 0)
+            printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n",
+                   sum_size);
+        if (malloc_size > 0) ncmpi_inq_malloc_list();
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (rank == 0) {
+        if (nerrs) printf(FAIL_STR,nerrs);
+        else printf(PASS_STR);
+    }
+
+    MPI_Finalize();
+    return (nerrs > 0);
+}
+
diff --git a/test/testcases/tst_version.c b/test/testcases/tst_version.c
index eee7d65fb..74c4e73bc 100644
--- a/test/testcases/tst_version.c
+++ b/test/testcases/tst_version.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
  * See
COPYRIGHT notice in top-level directory. * - * Check whether PnetCDF version string returned from ncmpi_inq_libvers() + * Check whether PnetCDF version string returned from ncmpi_inq_libvers() * matches the constant PNETCDF_VERSION defined in header file pnetcdf.h. * */ diff --git a/test/testcases/varn_int.c b/test/testcases/varn_int.c index f82469086..4f5b2d045 100644 --- a/test/testcases/varn_int.c +++ b/test/testcases/varn_int.c @@ -23,11 +23,11 @@ * X = 10 ; * REC_DIM = UNLIMITED ; // (4 currently) * variables: - * int var(Y, X) ; + * int fix_var(Y, X) ; * int rec_var(REC_DIM, X) ; * data: * - * var = + * fix_var = * 13, 13, 13, 11, 11, 10, 10, 12, 11, 11, * 10, 12, 12, 12, 13, 11, 11, 12, 12, 12, * 11, 11, 12, 13, 13, 13, 10, 10, 11, 11, @@ -91,7 +91,7 @@ int main(int argc, char** argv) { char filename[256]; int i, j, rank, nprocs, err, nerrs=0; - int ncid, cmode, varid[3], dimid[2], num_reqs, *buffer, *r_buffer; + int ncid, cmode, varid[2], dimid[2], num_reqs, *buffer, *r_buffer; MPI_Offset w_len, **starts=NULL, **counts=NULL; MPI_Init(&argc, &argv); @@ -129,7 +129,7 @@ int main(int argc, char** argv) CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_INT, NDIMS, dimid, &varid[0]); + err = ncmpi_def_var(ncid, "fix_var", NC_INT, NDIMS, dimid, &varid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -290,7 +290,7 @@ int main(int argc, char** argv) for (i=0; i
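
The section closes with two varn tests: varn_int.c (whose fixed-size variable is renamed fix_var to match the new tst_varn_var1.c) passes explicit per-request starts[] and counts[], while tst_varn_var1.c writes many single-element requests in one collective call. In the varn family, counts may be passed as NULL, in which case every request accesses exactly one array element. A minimal sketch of that single-element convention, using the same contiguous-allocation idiom as tst_varn_var1.c (the helper name is illustrative):

    #include <stdlib.h>
    #include <mpi.h>
    #include <pnetcdf.h>

    /* Sketch: write num single elements of an ndims-dimensional variable
     * in one call. starts[i] points to an ndims-long index vector; a NULL
     * counts argument means each request has length 1 in every dimension. */
    static int put_single_elements(int ncid, int varid, int ndims, int num,
                                   const int *buf)
    {
        int i, err;
        MPI_Offset **starts;

        /* one contiguous index block, carved into num vectors */
        starts = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * num);
        starts[0] = (MPI_Offset*) calloc(num * ndims, sizeof(MPI_Offset));
        for (i=1; i<num; i++) starts[i] = starts[i-1] + ndims;

        /* ... set starts[i][d] to the target indices of each request ... */

        err = ncmpi_put_varn_int_all(ncid, varid, num, starts, NULL, buf);

        free(starts[0]);
        free(starts);
        return err;
    }
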