diff --git a/.github/workflows/mac_mpich.yml b/.github/workflows/mac_mpich.yml index 8bab7aed4..72ca9c903 100644 --- a/.github/workflows/mac_mpich.yml +++ b/.github/workflows/mac_mpich.yml @@ -142,6 +142,42 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + rm -rf pnetcdf_output + mkdir -p pnetcdf_output + ./configure --disable-fortran \ + --with-mpi=${GITHUB_WORKSPACE}/MPICH \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/mac_openmpi.yml b/.github/workflows/mac_openmpi.yml index 7cdf0b2c1..65fcb10be 100644 --- a/.github/workflows/mac_openmpi.yml +++ b/.github/workflows/mac_openmpi.yml @@ -144,6 +144,42 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + rm -rf pnetcdf_output + mkdir -p pnetcdf_output + ./configure --disable-fortran \ + --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/ubuntu_mpich.yml b/.github/workflows/ubuntu_mpich.yml index 68aae837f..09d626f8f 100644 --- a/.github/workflows/ubuntu_mpich.yml +++ b/.github/workflows/ubuntu_mpich.yml @@ -154,6 +154,39 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --with-mpi=${GITHUB_WORKSPACE}/MPICH + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log 
+ - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/.github/workflows/ubuntu_openmpi.yml b/.github/workflows/ubuntu_openmpi.yml index b995f70d1..80f087295 100644 --- a/.github/workflows/ubuntu_openmpi.yml +++ b/.github/workflows/ubuntu_openmpi.yml @@ -150,6 +150,41 @@ jobs: run: | cd ${GITHUB_WORKSPACE} make ptests + - name: Build PnetCDF (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + export PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/bin:${PATH}" + export LD_LIBRARY_PATH="${GITHUB_WORKSPACE}/AUTOTOOLS/lib:${LD_LIBRARY_PATH}" + make distclean + mkdir -p pnetcdf_output + ./configure --prefix=${GITHUB_WORKSPACE}/PnetCDF \ + --with-mpi=${GITHUB_WORKSPACE}/OPENMPI \ + TESTOUTDIR=${GITHUB_WORKSPACE}/pnetcdf_output + make -j 8 tests + - name: Print config.log (default configuration) + if: ${{ always() }} + run: | + cat ${GITHUB_WORKSPACE}/config.log + - name: make check (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make check + - name: Print test log files (default configuration) + if: ${{ always() }} + run: | + cd ${GITHUB_WORKSPACE} + fname=`find src test examples benchmarks -type f -name "*.log"` + for f in $fname ; do \ + bname=`basename $f` ; \ + if test "x$bname" != xconfig.log ; then \ + echo "-------- dump $f ----------------------------" ; \ + cat $f ; \ + fi ; \ + done + - name: make ptests (default configuration) + run: | + cd ${GITHUB_WORKSPACE} + make ptests - name: make distcheck run: | cd ${GITHUB_WORKSPACE} diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index f1b3693a6..0ca204b83 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -149,12 +149,17 @@ 10. Generate SHA1 checksums * Run command: ``` - openssl sha1 pnetcdf-1.11.0.tar.gz` + openssl sha1 pnetcdf-1.11.0.tar.gz ``` * Example command-line output: ``` SHA1(pnetcdf-1.11.0.tar.gz)= 495d42f0a41abbd09d276262dce0f7c1c535968a ``` + * Or use SHA-256 + ``` + sha256sum pnetcdf-1.11.0.tar.gz + a18a1a43e6c4fd7ef5827dbe90e9dcf1363b758f513af1f1356ed6c651195a9f pnetcdf-1.11.0.tar.gz + ``` 11. Update PnetCDF Web Page * https://github.com/Parallel-NetCDF/Parallel-NetCDF.github.io * Create a new file of release note Parallel-NetCDF.github.io/Release_notes/1.11.0.md. diff --git a/RELEASE_NOTES b/RELEASE_NOTES index c0e031f48..0007f5d68 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,6 +6,44 @@ PnetCDF Release Notes Version _PNETCDF_VERSION_ (_PNETCDF_RELEASE_DATE_) ------------------------------------- +* New feature + + Intra-node aggregation for read requests is added. This complements the + intra-node aggregation for write requests first implemented in version + 1.14.0. Now intra-node aggregation supports both write and read + operations. This feature can be enabled by setting hint + "nc_num_aggrs_per_node" to the desired number of aggregators per compute + node. + +* New optimization + + A new internal I/O driver, named "pncio", is added, which implements + several strategies for performance improvement.
A significant portion of this + driver was developed to improve performance when Lustre is used. It + includes the following: + * When creating a new file, it tries to set the Lustre file striping count + to the number of compute nodes allocated to the MPI communicator passed + to "ncmpi_create()", when I/O hint "striping_factor" is not explicitly + set by the applications. + * It automatically sets a good value for hint "cb_nodes" when it is not + explicitly set by the applications. + +* API deprecated + + "vard" APIs introduced in version 1.6.0 are now deprecated. This is the + family of APIs that take an argument of an MPI derived data type + describing the file access layout, which is used as the fileview by the + underlying MPI library. + +* New error code + + "NC_EFSTYPE" indicates an error when an invalid file system type is + detected. + +* New PnetCDF hint + + "nc_pncio" -- To enable or disable the use of the internal "pncio" driver. + Its string value is either "enable" or "disable". The default is "enable". + + +------------------------------------- +Version 1.14.1 (July 31, 2025) +------------------------------------- + * New optimization + When file header extent size grows, moving the data section to a higher file offset has changed to be done in chunks of 16 MB per process. diff --git a/benchmarks/C/Makefile.am b/benchmarks/C/Makefile.am index 333176cbf..e45408dcd 100644 --- a/benchmarks/C/Makefile.am +++ b/benchmarks/C/Makefile.am @@ -40,14 +40,30 @@ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ # be used to compare NetCDF4 performance against PnetCDF. EXTRA_DIST = parallel_run.sh netcdf_put_vara.c -ptest ptests ptest4: $(check_PROGRAMS) +ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ $(srcdir)/parallel_run.sh 4 || exit 1 -ptest2 ptest6 ptest8 ptest10: +ptest2: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 2 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 2 || exit 1 + +ptest10: $(check_PROGRAMS) + @echo "===========================================================" + @echo " $(subdir): Parallel testing on 10 MPI processes" + @echo "===========================================================" + @$(TESTS_ENVIRONMENT) \ + $(srcdir)/parallel_run.sh 10 || exit 1 + +ptest6 ptest8: + +ptests: ptest2 ptest4 ptest10 # build check targets but not invoke tests-local: all $(check_PROGRAMS) diff --git a/benchmarks/C/parallel_run.sh b/benchmarks/C/parallel_run.sh index be9212db8..8f2593030 100755 --- a/benchmarks/C/parallel_run.sh +++ b/benchmarks/C/parallel_run.sh @@ -10,15 +10,18 @@ set -e VALIDATOR=../../src/utils/ncvalidator/ncvalidator NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff -# remove file system type prefix if there is any -OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` - MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "check_PROGRAMS=${check_PROGRAMS}" +# remove file system type prefix if there is any +OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` + +# let NTHREADS=$1*6-1 +NTHREADS=`expr $1 \* 6 - 1` + # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,60
+30,112 @@ fi # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS +fixed_length=23 + for i in ${check_PROGRAMS} ; do + for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + safe_hint=" SAFE" else - export PNETCDF_HINTS= + SAFE_HINTS="romio_no_indep_rw=false" + safe_hint="NOSAFE" fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" + driver_hint=" MPIO" + else + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" + driver_hint="PNCIO" + fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + ina_hint=" INA" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + ina_hint="NOINA" + fi + + OUT_FILE=$INA_OUT_FILE + TEST_OPTS="$safe_hint $driver_hint $ina_hint" + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" - OPTS= + CMD_OPTS="-q -l 10" if test "$i" = "aggregation" ; then - OPTS="-b -c -i -j" + CMD_OPTS+=" -b -c -i -j" fi - # echo "${MPIRUN} ./$i -q ${OPTS} -l 10 ${TESTOUTDIR}/$i.nc" - ${MPIRUN} ./$i -q ${OPTS} -l 10 ${TESTOUTDIR}/$i.nc + + # echo "${LINENO}: ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc + if test $? = 0 ; then - echo "PASS: C parallel run on $1 processes --------------- $i" + printf "PASS: C nprocs=$1 %-${fixed_length}s -------- $i\n" "$TEST_OPTS" fi - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc - # echo "" + # echo "${LINENO}:--- validating file ${OUT_FILE}.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - # echo "test burst buffering feature" + # echo "---- test burst buffering feature" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i -q -l 10 ${TESTOUTDIR}/$i.bb.nc + # echo "${LINENO}: ${MPIRUN} ./$i ${OUT_FILE}.bb.nc" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.bb.nc + if test $? 
= 0 ; then - echo "PASS: C parallel run on $1 processes --------------- $i" + printf "PASS: C nprocs=$1 %-${fixed_length}s -------- $i\n" "$TEST_OPTS BB" fi + export PNETCDF_HINTS=${saved_PNETCDF_HINTS} - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + # echo "${LINENO}: --- validating file ${OUT_FILE}.bb.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc - # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc + DIFF_OPT="-q" + # echo "${LINENO}: --- ncmpidiff $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc ---" + ${MPIRUN} ${NCMPIDIFF} $DIFF_OPT $OUT_FILE.nc $OUT_FILE.bb.nc fi - if test "x${ENABLE_NETCDF4}" = x1 ; then - # echo "test netCDF-4 feature" - ${MPIRUN} ./$i -q -l 10 ${TESTOUTDIR}/$i.nc4 4 - # Validator does not support nc4 - fi - done - done - rm -f ${OUTDIR}/$i.nc - rm -f ${OUTDIR}/$i.bb.nc - rm -f ${OUTDIR}/$i.nc4 -done + if test "x${ENABLE_NETCDF4}" = x1 ; then + # echo "${LINENO}: test netCDF-4 feature" + ${MPIRUN} ./$i $CMD_OPTS ${OUT_FILE}.nc4 4 + # Validator does not support nc4 + fi + done # intra_aggr + done # mpiio_mode + + DIFF_OPT="-q" + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + # echo "${LINENO}: --- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc + + done # safe_modes + rm -f ${OUTDIR}/$i*nc* +done # check_PROGRAMS diff --git a/benchmarks/C/write_block_read_column.c b/benchmarks/C/write_block_read_column.c index 4c129794c..5048783f4 100644 --- a/benchmarks/C/write_block_read_column.c +++ b/benchmarks/C/write_block_read_column.c @@ -42,6 +42,8 @@ #define NVARS 4 +static int verbose; + #define ERR(e) {if((e)!=NC_NOERR){printf("Error at line=%d: %s\n", __LINE__, ncmpi_strerror(e));nerrs++;}} /*----< print_info() >------------------------------------------------------*/ @@ -124,6 +126,8 @@ int benchmark_write(char *filename, psizes[0] = psizes[1] = 0; MPI_Dims_create(nprocs, 2, psizes); + if (verbose && rank == 0) printf("psizes = %d %d\n",psizes[0],psizes[1]); + gsizes[0] = len * psizes[0]; gsizes[1] = len * psizes[1]; @@ -157,11 +161,14 @@ int benchmark_write(char *filename, timing[2] = end_t - start_t; start_t = end_t; - start[0] = len * (rank % psizes[0]); - start[1] = len * ((rank / psizes[1]) % psizes[1]); + start[0] = len * (rank / psizes[1]); + start[1] = len * (rank % psizes[1]); count[0] = len; count[1] = len; + if (verbose) + printf("%d: start=%lld %lld count=%lld %lld\n",rank,start[0],start[1],count[0],count[1]); + for (i=0; i= ntimes */ vars[i].start[0] = j; if (vars[i].xtype == NC_FLOAT) @@ -913,17 +926,27 @@ int wrf_r_benchmark(char *in_file, printf(" %.2f GiB/s\n", bw/1024.0/max_t[0]); printf("-----------------------------------------------------------\n"); MPI_Info_get(info_used, "striping_factor", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint striping_factor: %s\n", value); + printf("MPI-IO hint striping_factor: %s\n", HINT); MPI_Info_get(info_used, "striping_unit", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint striping_unit: %s\n", value); + printf("MPI-IO hint striping_unit: %s\n", HINT); MPI_Info_get(info_used, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &flag); - 
printf("MPI-IO hint cb_buffer_size: %s\n", value); - MPI_Info_get(info_used, "cb_node_list", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint cb_node_list: %s\n", value); + printf("MPI-IO hint cb_buffer_size: %s\n", HINT); MPI_Info_get(info_used, "cb_nodes", MPI_MAX_INFO_VAL, value, &flag); - printf("MPI-IO hint cb_nodes: %s\n", value); + printf("MPI-IO hint cb_nodes: %s\n", HINT); + MPI_Info_get(info_used, "cb_config_list", MPI_MAX_INFO_VAL, value, &flag); + printf("MPI-IO hint cb_config_list: %s\n", HINT); + MPI_Info_get(info_used, "cb_node_list", MPI_MAX_INFO_VAL, value, &flag); + printf("MPI-IO hint cb_node_list: %s\n", HINT); + MPI_Info_get(info_used, "nc_pncio", MPI_MAX_INFO_VAL, value, &flag); + printf("PnetCDF hint nc_pncio: %s\n", HINT); MPI_Info_get(info_used, "nc_num_aggrs_per_node",MPI_MAX_INFO_VAL, value, &flag); - printf("PnetCDF hint nc_num_aggrs_per_node: %s\n", value); + printf("PnetCDF hint nc_num_aggrs_per_node: %s\n", HINT); + MPI_Info_get(info_used, "nc_ina_node_list", MPI_MAX_INFO_VAL, value, &flag); + printf("PnetCDF hint nc_ina_node_list: %s\n", HINT); + MPI_Info_get(info_used, "cray_cb_nodes_multiplier", MPI_MAX_INFO_VAL, value, &flag); + printf("Hint cray_cb_nodes_multiplier: %s\n", HINT); + MPI_Info_get(info_used, "cray_cb_write_lock_mode", MPI_MAX_INFO_VAL, value, &flag); + printf("Hint cray_cb_write_lock_mode: %s\n", HINT); printf("-----------------------------------------------------------\n"); } MPI_Info_free(&info_used); diff --git a/configure.ac b/configure.ac index 142612104..8fec505ba 100644 --- a/configure.ac +++ b/configure.ac @@ -15,7 +15,7 @@ dnl AC_REVISION([$Revision$])dnl dnl autoconf v2.70 and later is required. See https://github.com/Parallel-NetCDF/PnetCDF/issues/94 dnl autoconf v2.70 was released in 2021-01-28 AC_PREREQ([2.70]) -AC_INIT([PnetCDF], [1.14.1], +AC_INIT([PnetCDF], [1.15.0-alpha], [parallel-netcdf@mcs.anl.gov], [pnetcdf], [https://parallel-netcdf.github.io]) @@ -69,8 +69,8 @@ AM_EXTRA_RECURSIVE_TARGETS([tests]) dnl parse the version numbers to 4 env variables PNETCDF_VERSION_MAJOR=`echo ${PACKAGE_VERSION} | cut -d. -f1` PNETCDF_VERSION_MINOR=`echo ${PACKAGE_VERSION} | cut -d. -f2` -PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3` -PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d. -f4` +PNETCDF_VERSION_SUB=`echo ${PACKAGE_VERSION} | cut -d. -f3 | cut -d'-' -f1` +PNETCDF_VERSION_PRE=`echo ${PACKAGE_VERSION} | cut -d'-' -f2` dnl Note major, minor, and sub are required, but pre is not. PNETCDF_VERSION=${PACKAGE_VERSION} @@ -1175,6 +1175,8 @@ dnl AC_CHECK_FUNCS([memset setlocale sqrt strchr strrchr strtol]) dnl AC_CHECK_LIB([m], [tanh]) dnl UD_CHECK_LIB_MATH +AC_CHECK_HEADERS([unistd.h fcntl.h malloc.h stddef.h sys/types.h limits.h time.h dirent.h]) + dnl When using gcc based compiler with -ansi flag, AC_CHECK_FUNCS can still dnl find strdup, but AC_CHECK_DECL cannot. So we check with AC_CHECK_DECL dnl first and then check AC_CHECK_FUNCS. 
@@ -1377,8 +1379,11 @@ AC_CHECK_FUNCS([MPI_Type_create_subarray_c \ MPI_Type_get_true_extent_c \ MPI_Type_get_envelope_c \ MPI_Type_get_contents_c \ + MPI_Status_set_elements_x \ MPI_Bcast_c \ MPI_Get_count_c \ + MPI_Isend_c \ + MPI_Irecv_c \ MPI_Pack_c \ MPI_Unpack_c \ MPI_File_read_at_c \ @@ -1459,6 +1464,14 @@ if test "$mpi_version" -ge "3" ; then [], [], [[#include <mpi.h>]]) fi +# check some MPI combiner types that are used internally in PnetCDF +UD_CHECK_MPI_CONSTANTS([MPI_COMBINER_DUP, + MPI_COMBINER_SUBARRAY, + MPI_COMBINER_DARRAY, + MPI_COMBINER_INDEXED_BLOCK, + MPI_COMBINER_HINDEXED_BLOCK], + [], [], [[#include <mpi.h>]]) + dnl Check presence of various MPI error classes. Introduced in MPI 2.0. dnl These could be enums, so we have to do compile checks. dnl AC_CHECK_DECLS([MPI_ERR_FILE_EXISTS, @@ -1521,6 +1534,94 @@ dnl UD_CHECK_MPI_DATATYPE(MPI_REAL8) dnl first defined in MPI 1.0 dnl UD_CHECK_MPI_DATATYPE(MPI_DOUBLE_PRECISION) dnl first defined in MPI 1.0 dnl fi +AC_MSG_CHECKING([whether MPI_Waitall takes MPI_STATUSES_IGNORE]) +if test "x${GCC}" = xyes; then + saved_CFLAGS=${CFLAGS} + CFLAGS="-Werror -Wstringop-overflow=2" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <mpi.h>]],[[ + int count; + MPI_Request *reqs; + MPI_Waitall(count, reqs, MPI_STATUSES_IGNORE); + ]])], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no]) + CFLAGS=${saved_CFLAGS} +else + AC_CHECK_DECL([MPI_STATUSES_IGNORE], [MPI_STATUSES_IGNORE=yes], [MPI_STATUSES_IGNORE=no], [[#include <mpi.h>]]) +fi +AC_MSG_RESULT([$MPI_STATUSES_IGNORE]) +if test "x$MPI_STATUSES_IGNORE" = xyes ; then + AC_DEFINE(HAVE_MPI_STATUSES_IGNORE, 1, [Whether MPI_Waitall takes argument MPI_STATUSES_IGNORE]) +fi + +# +# Check for statfs (many) and specifically f_fstypename field (BSD) +# +AC_CHECK_HEADERS(sys/vfs.h sys/param.h sys/mount.h sys/statvfs.h sys/stat.h sys/type.h unistd.h) + +AC_CHECK_FUNCS([statvfs statfs stat]) + +AC_CHECK_MEMBERS([struct statvfs.f_basetype, + struct statfs.f_fstypename, + struct statfs.f_type, + struct stat.st_fstype],[],[], + AC_INCLUDES_DEFAULT + [#ifdef HAVE_SYS_VFS_H + #include <sys/vfs.h> + #endif + #ifdef HAVE_SYS_PARAM_H + #include <sys/param.h> + #endif + #ifdef HAVE_SYS_MOUNT_H + #include <sys/mount.h> + #endif + #ifdef HAVE_SYS_STATFS_H + #include <sys/statfs.h> + #endif + #ifdef HAVE_SYS_STAT_H + #include <sys/stat.h> + #endif + #ifdef HAVE_SYS_TYPE_H + #include <sys/type.h> + #endif + #ifdef HAVE_UNISTD_H + #include <unistd.h> + #endif + ]) + +AC_CHECK_TYPE([blksize_t],[],[AC_DEFINE_UNQUOTED([blksize_t],[__blksize_t],[Provide blksize_t if not available]) ], [[ + #ifdef HAVE_SYS_TYPES_H + #include <sys/types.h> + #endif + #ifdef HAVE_SYS_STAT_H + #include <sys/stat.h> + #endif + #ifdef HAVE_UNISTD_H + #include <unistd.h> + #endif]] ) + +AC_CHECK_DECLS([pwrite]) + +# +# Check if Lustre is available by verifying presence of lustre/lustre_user.h +# +has_lustre=no +AC_CHECK_HEADERS([lustre/lustre_user.h linux/lustre/lustre_user.h], + [has_lustre=yes ; break]) +if test "x$has_lustre" = xyes ; then + AC_DEFINE(HAVE_LUSTRE, 1, [Define for LUSTRE]) + LIBS="$LIBS -llustreapi" +fi +AM_CONDITIONAL(HAVE_LUSTRE, [test x$has_lustre = xyes]) + +if test "x$has_lustre" = xno ; then + AC_MSG_CHECKING([whether mimicking Lustre]) + mimicking_lustre=no + if test "x$MIMIC_LUSTRE" = xyes ; then + AC_DEFINE(MIMIC_LUSTRE, 1, [Define for mimicking LUSTRE file system]) + mimicking_lustre=yes + fi + AC_MSG_RESULT($mimicking_lustre) +fi + AC_C_CHAR_UNSIGNED AC_C_BIGENDIAN AM_CONDITIONAL(IS_BIGENDIAN, [test x$ac_cv_c_bigendian = xyes]) @@ -1739,8 +1840,15 @@ if test "x${debug}" = xyes; then if test "x$?"
!= x0 ; then CFLAGS="$CFLAGS -g" fi - CFLAGS=`echo $CFLAGS | ${SED} 's/-O. *//g' | ${SED} 's/-fast *//g'` - CFLAGS="$CFLAGS -O0" + + # remove -fast if set by user + CFLAGS=`echo $CFLAGS | ${SED} 's/-fast *//g'` + + # check if -O is set by user, if not, then add -O0 + str_found=`echo "${CFLAGS}" | ${EGREP} -- "-O"` + if test "x$str_found" = x ; then + CFLAGS="$CFLAGS -O0" + fi if test "x${has_mpicxx}" = xyes ; then str_found=`echo "${CXXFLAGS}" | ${EGREP} -- "-g"` @@ -2603,10 +2711,10 @@ else # no name prefix end with ':' FSTYPE_PREFIX= else - # check if name prefix is one of file system types known to ROMIO - romio_known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) + # check if name prefix is one of file system types known to PNCIO + known_fstypes=(ufs nfs xfs pvfs2 gpfs panfs lustre daos testfs ime quobyte) known_fstype= - for pre in $romio_known_fstypes ; do + for pre in $known_fstypes ; do if test "$FSTYPE_PREFIX" = $pre ; then known_fstype=$pre break @@ -2691,7 +2799,7 @@ dnl Update the version information only immediately before a public release. dnl PnetCDF starts with 1:0:0 (shared library is first supported in 1.9.0) dnl because some package distributors, such as Debian, may have already built dnl PnetCDF with shared libraries. -ABIVERSION="7:0:0" +ABIVERSION="8:0:1" AC_SUBST(ABIVERSION) if test "$enable_versioning" = "yes" ; then ABIVERSIONFLAGS="-version-info \$(ABIVERSION)" @@ -2711,6 +2819,7 @@ AC_CONFIG_FILES(Makefile \ src/drivers/common/Makefile \ src/drivers/include/Makefile \ src/drivers/ncmpio/Makefile \ + src/drivers/pncio/Makefile \ src/drivers/nc4io/Makefile \ src/drivers/ncadios/Makefile \ src/drivers/ncbbio/Makefile \ diff --git a/examples/C/Makefile.am b/examples/C/Makefile.am index 009095a77..c1b881ebc 100644 --- a/examples/C/Makefile.am +++ b/examples/C/Makefile.am @@ -73,28 +73,28 @@ NC_FILES = $(check_PROGRAMS:%=$(TESTOUTDIR)/%.nc) \ CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out \ $(NC_FILES) $(TESTOUTDIR)/pthread.nc.* $(TESTOUTDIR)/testfile.nc -EXTRA_DIST = parallel_run.sh run_c_examples.sh cdl_header.txt +EXTRA_DIST = run_c_examples.sh cdl_header.txt ptest ptest4: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 "C " || exit 1 ptest8: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 8 "C " || exit 1 ptest3: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 3 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 3 || exit 1 + $(srcdir)/../parallel_run.sh 3 "C " || exit 1 ptests: ptest3 ptest4 ptest8 ptest2 ptest6 ptest10: diff --git a/examples/C/create_from_cdl.c b/examples/C/create_from_cdl.c index a2a2ae179..2a238153d 100644 --- a/examples/C/create_from_cdl.c +++ b/examples/C/create_from_cdl.c @@ -168,13 +168,16 @@ int main(int argc, char **argv) err = ncmpi_def_var(ncid, name, xtype, ndims, dimids, &varid); CHECK_ERR + /* fill with default fill value */ + err = 
ncmpi_def_var_fill(ncid, varid, 0, NULL); + CHECK_ERR + /* retrieve metadata of attribute j associated with variable i */ err = cdl_hdr_inq_nattrs(hid, i, &nattrs); CHECK_ERR for (j=0; j 0) { /* non-scalar attribute */ /* note xsz is aligned, thus must use the exact size of buf */ - int rank, itype_size; + int itype_size; size_t buf_size; void *root_buf; - MPI_Comm_rank(comm, &rank); - /* for attributes, itype is nc_type, so its size is small. Thus, no * need to check against NC_MAX_INT. */ diff --git a/src/dispatchers/cdl_header_parser.c b/src/dispatchers/cdl_header_parser.c index 421999e85..d5efc2e78 100644 --- a/src/dispatchers/cdl_header_parser.c +++ b/src/dispatchers/cdl_header_parser.c @@ -720,7 +720,7 @@ int cdl_hdr_open(const char *filename, return NC_EFILE; } rlen = fread(fbuf, 1, file_size, fptr); - if (rlen < 0) { + if (file_size > 0 && rlen == 0) { printf("Error in %s at %d: fail to fread file %s (%s)\n", __func__,__LINE__,filename,strerror(errno)); return NC_EFILE; diff --git a/src/dispatchers/error_codes.c b/src/dispatchers/error_codes.c index 1a71538f0..e74a20d48 100644 --- a/src/dispatchers/error_codes.c +++ b/src/dispatchers/error_codes.c @@ -295,6 +295,8 @@ ncmpi_strerror(int err) return "Nonblocking requests already flushed."; case NC_EADIOS: return "unknown ADIOS error."; + case NC_EFSTYPE: + return "Invalid file system type."; default: /* check netCDF-3 and netCDF-4 errors */ @@ -719,6 +721,7 @@ ncmpi_strerrno(int err) case (NC_EBADLOG): return "NC_EBADLOG"; case (NC_EFLUSHED): return "NC_EFLUSHED"; case (NC_EADIOS): return "NC_EADIOS"; + case (NC_EFSTYPE): return "NC_EFSTYPE"; case (NC_EMULTIDEFINE): return "NC_EMULTIDEFINE"; case (NC_EMULTIDEFINE_OMODE): return "NC_EMULTIDEFINE_OMODE"; diff --git a/src/dispatchers/file.c b/src/dispatchers/file.c index b68782038..b7805d806 100644 --- a/src/dispatchers/file.c +++ b/src/dispatchers/file.c @@ -495,31 +495,26 @@ ncmpi_create(MPI_Comm comm, else pncp->comm = comm; + /* fill in pncp members */ + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's create subroutine */ - err = driver->create(pncp->comm, path, cmode, *ncidp, combined_info, &ncp); + err = driver->create(pncp->comm, pncp->path, cmode, *ncidp, combined_info, + &ncp); if (status == NC_NOERR) status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_CMODE) { del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } - /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = cmode; pncp->driver = driver; pncp->ndims = 0; @@ -759,8 +754,13 @@ ncmpi_open(MPI_Comm comm, else pncp->comm = comm; + pncp->path = (char*) NCI_Strdup(path); + if (pncp->path == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + /* calling the driver's open subroutine */ - err = driver->open(pncp->comm, path, omode, *ncidp, combined_info, &ncp); + err = driver->open(pncp->comm, pncp->path, omode, *ncidp, combined_info, + &ncp); if (status == NC_NOERR) 
status = err; if (combined_info != MPI_INFO_NULL) MPI_Info_free(&combined_info); if (status != NC_NOERR && status != NC_EMULTIDEFINE_OMODE && @@ -770,23 +770,13 @@ ncmpi_open(MPI_Comm comm, del_from_PNCList(*ncidp); if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) MPI_Comm_free(&pncp->comm); /* a collective call */ + NCI_Free(pncp->path); NCI_Free(pncp); *ncidp = -1; return status; } /* fill in pncp members */ - pncp->path = (char*) NCI_Malloc(strlen(path)+1); - if (pncp->path == NULL) { - driver->close(ncp); /* close file and ignore error */ - del_from_PNCList(*ncidp); - if (pncp->comm != MPI_COMM_WORLD && pncp->comm != MPI_COMM_SELF) - MPI_Comm_free(&pncp->comm); /* a collective call */ - NCI_Free(pncp); - *ncidp = -1; - DEBUG_RETURN_ERROR(NC_ENOMEM) - } - strcpy(pncp->path, path); pncp->mode = omode; pncp->driver = driver; pncp->ndims = 0; @@ -1251,9 +1241,8 @@ ncmpi_inq_file_format(const char *filename, __func__,__LINE__,filename); DEBUG_RETURN_ERROR(NC_EFILE) } - if (close(fd) == -1) { + if (close(fd) == -1) DEBUG_RETURN_ERROR(NC_EFILE) - } if (memcmp(signature, cdf_signature, 3) == 0) { if (signature[3] == 5) *formatp = NC_FORMAT_CDF5; diff --git a/src/drivers/Makefile.am b/src/drivers/Makefile.am index 3749fcd99..de1a6a092 100644 --- a/src/drivers/Makefile.am +++ b/src/drivers/Makefile.am @@ -6,7 +6,7 @@ # # @configure_input@ -SUBDIRS = include common ncmpio +SUBDIRS = include common ncmpio pncio if BUILD_DRIVER_FOO SUBDIRS += ncfoo @@ -24,7 +24,7 @@ if ENABLE_ADIOS SUBDIRS += ncadios endif -DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios +DIST_SUBDIRS = include common ncmpio ncfoo ncbbio nc4io ncadios pncio # For VPATH build (parallel build), try delete all sub-directories distclean-local: diff --git a/src/drivers/common/mem_alloc.c b/src/drivers/common/mem_alloc.c index 279dd44b4..7def10c6c 100644 --- a/src/drivers/common/mem_alloc.c +++ b/src/drivers/common/mem_alloc.c @@ -9,13 +9,15 @@ NCI_Malloc(size) NCI_Calloc(nelems, esize) NCI_Realloc(ptr, size) + NCI_Strdup(ptr) NCI_Free(ptr) In macro.h, they are macro-replaced to - NCI_Malloc_fn(size, __LINE__, __FILE__) and - NCI_Calloc_fn(nelems, esize, __LINE__, __FILE__) and + NCI_Malloc_fn(size, __LINE__, __func__, __FILE__) + NCI_Calloc_fn(nelems, esize, __LINE__, __func__, __FILE__) NCI_Realloc_fn(ptr, size, __LINE__, __func__, __FILE__) - NCI_Free_fn(ptr,__LINE__,__FILE__). + NCI_Strdup_fn(ptr, __LINE__, __func__, __FILE__) + NCI_Free_fn(ptr, __LINE__, __func__, __FILE__). */ #ifdef HAVE_CONFIG_H diff --git a/src/drivers/common/utils.c b/src/drivers/common/utils.c index e60b6e30a..ffac366ac 100644 --- a/src/drivers/common/utils.c +++ b/src/drivers/common/utils.c @@ -61,7 +61,7 @@ ncmpii_xlen_nc_type(nc_type xtype, int *size) } } -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", "testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if @@ -91,3 +91,152 @@ char* ncmpii_remove_file_system_type_prefix(const char *filename) return ret_filename; } +/*----< ncmpii_construct_node_list() >---------------------------------------*/ +/* This subroutine is a collective call. 
It finds the affinity of each MPI + * process to the compute node and returns the following: + * num_nodes_ptr Number of unique nodes (host names) + * node_ids_ptr [nprocs] node IDs of each rank, must be freed by caller. + */ +int +ncmpii_construct_node_list(MPI_Comm comm, + int *num_nodes_ptr, /* OUT: */ + int **node_ids_ptr) /* OUT: [nprocs] */ +{ + char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; + int i, j, k, rank, nprocs, num_nodes, my_procname_len, root=0; + int *node_ids=NULL, *all_procname_lens=NULL; + + MPI_Comm_size(comm, &nprocs); + MPI_Comm_rank(comm, &rank); + + /* Collect host names of allocated compute nodes. Note my_procname is null + * character terminated, but my_procname_len does not include the null + * character. + */ + MPI_Get_processor_name(my_procname, &my_procname_len); +#if 0 +#ifdef MIMIC_LUSTRE +#define MIMIC_NUM_NODES 1 + /* mimic number of compute nodes = MIMIC_NUM_NODES */ + int node_id, np_per_node = nprocs / MIMIC_NUM_NODES; + if (nprocs % MIMIC_NUM_NODES > 0) np_per_node++; + if (rank < np_per_node * (nprocs % MIMIC_NUM_NODES)) + node_id = rank / np_per_node; + else + node_id = (rank - np_per_node * (nprocs % MIMIC_NUM_NODES)) / (nprocs / MIMIC_NUM_NODES) + (nprocs % MIMIC_NUM_NODES); + + sprintf(my_procname,"compute.node.%d", node_id); + my_procname_len = (int)strlen(my_procname); +#endif +#endif + + my_procname_len++; /* to include the terminating null character */ + + if (rank == root) { + /* root collects all procnames */ + all_procnames = (char **) NCI_Malloc(sizeof(char*) * nprocs); + if (all_procnames == NULL) + DEBUG_RETURN_ERROR(NC_ENOMEM) + + all_procname_lens = (int *) NCI_Malloc(sizeof(int) * nprocs); + if (all_procname_lens == NULL) { + NCI_Free(all_procnames); + DEBUG_RETURN_ERROR(NC_ENOMEM) + } + } + /* gather process name lengths from all processes first */ + MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, + root, comm); + + if (rank == root) { + int *disp; + size_t alloc_size = 0; + + for (i=0; icomm); + err = ncbbp->ncmpio_driver->begin_indep_data(ncbbp->ncp); if (err != NC_NOERR) return err; diff --git a/src/drivers/ncmpio/Makefile.am b/src/drivers/ncmpio/Makefile.am index c1afe76c1..9cbd78e13 100644 --- a/src/drivers/ncmpio/Makefile.am +++ b/src/drivers/ncmpio/Makefile.am @@ -12,6 +12,7 @@ AM_CPPFLAGS = -I${top_srcdir}/src/include AM_CPPFLAGS += -I${top_builddir}/src/include AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include AM_CPPFLAGS += -I${top_builddir}/src/drivers/include +AM_CPPFLAGS += -I${top_srcdir}/src/drivers/pncio if PNETCDF_DEBUG AM_CPPFLAGS += -DPNETCDF_DEBUG diff --git a/src/drivers/ncmpio/ncmpio_NC.h b/src/drivers/ncmpio/ncmpio_NC.h index 0e2c71e81..f64567ab0 100644 --- a/src/drivers/ncmpio/ncmpio_NC.h +++ b/src/drivers/ncmpio/ncmpio_NC.h @@ -16,6 +16,7 @@ #include <mpi.h> #include "ncmpio_driver.h" +#include "pncio.h" #define NC_DEFAULT_H_MINFREE 0 #define NC_DEFAULT_V_ALIGN 512 @@ -156,7 +157,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_dimarray { - int ndefined; /* number of defined dimensions */ + int ndefined; /* no. defined dimensions */ int unlimited_id; /* -1 for not defined, otherwise >= 0 */ NC_dim **value; int hash_size; @@ -180,7 +181,7 @@ ncmpio_dup_NC_dimarray(NC_dimarray *ncap, const NC_dimarray *ref); * NC attribute */ typedef struct { - MPI_Offset nelems; /* number of attribute elements */ + MPI_Offset nelems; /* no.
attribute elements */ MPI_Offset xsz; /* amount of space at xvalue (4-byte aligned) */ nc_type xtype; /* external NC data type of the attribute */ size_t name_len; /* strlen(name) for faster string compare */ @@ -199,7 +200,7 @@ typedef struct { * specifications can be of type 8-byte integers. */ typedef struct NC_attrarray { - int ndefined; /* number of defined attributes */ + int ndefined; /* no. defined attributes */ NC_attr **value; int hash_size; NC_nametable *nameT; @@ -238,7 +239,7 @@ typedef struct { int no_fill; /* whether fill mode is disabled */ size_t name_len;/* strlen(name) for faster string compare */ char *name; /* name of the variable */ - int ndims; /* number of dimensions */ + int ndims; /* no. dimensions */ int *dimids; /* [ndims] array of dimension IDs */ MPI_Offset *shape; /* [ndims] dim->size of each dim shape[0] == NC_UNLIMITED if record variable */ @@ -268,8 +269,8 @@ typedef struct { */ /* note: we only allow less than 2^31-1 variables defined in a file */ typedef struct NC_vararray { - int ndefined; /* number of defined variables */ - int num_rec_vars;/* number of defined record variables */ + int ndefined; /* no. defined variables */ + int num_rec_vars;/* no. defined record variables */ NC_var **value; int hash_size; NC_nametable *nameT; @@ -319,15 +320,15 @@ typedef struct NC_lead_req { int flag; /* bit-wise OR of the above NC_REQ_* flags */ int id; /* even number for write, odd for read */ int nonlead_off; /* start index in the non-lead queue */ - int nonlead_num; /* number of non-lead requests */ + int nonlead_num; /* no. non-lead requests */ int abuf_index; /* index in the abuf occupy_table. -1 means not using attached buffer */ void *buf; /* user buffer */ void *xbuf; /* buffer in external type, may be == buf */ NC_var *varp; /* pointer to NC variable object */ - MPI_Offset nelems; /* total number of array elements requested */ + MPI_Offset nelems; /* total no. array elements requested */ MPI_Offset max_rec; /* highest record requested */ - MPI_Offset bufcount; /* number of buftype in this request */ + MPI_Offset bufcount; /* no. buftype in this request */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ MPI_Datatype buftype; /* user defined derived data type */ MPI_Datatype itype; /* internal element data type in buftype */ @@ -338,10 +339,11 @@ typedef struct NC_lead_req { typedef struct NC_req { MPI_Offset offset_start; /* starting offset of aggregate access region */ MPI_Offset offset_end; /* ending offset of aggregate access region */ - MPI_Offset nelems; /* number of array elements requested */ + MPI_Offset nelems; /* no. array elements requested */ MPI_Offset *start; /* [varp->ndims*3] for start/count/stride */ void *xbuf; /* buffer in external type, used in file I/O calls */ int lead_off; /* start index in the lead queue */ + MPI_Aint npairs; /* no. flattened offset-length pairs */ } NC_req; #define NC_ABUF_DEFAULT_TABLE_SIZE 128 @@ -382,11 +384,10 @@ struct NC { int safe_mode; /* 0 or 1, for parameter consistency check */ #ifdef ENABLE_SUBFILING int subfile_mode; /* 0 or 1, for disable/enable subfiling */ - int num_subfiles; /* number of subfiles */ + int num_subfiles; /* no. 
subfiles */ struct NC *ncp_sf; /* ncp of subfile */ MPI_Comm comm_sf; /* subfile MPI communicator */ #endif - int striping_unit; /* stripe size of the file */ int chunk; /* chunk size for reading header, one chunk at a time */ MPI_Offset v_align; /* alignment of the beginning of fixed-size variables */ MPI_Offset r_align; /* file alignment for record variable section */ @@ -407,16 +408,20 @@ struct NC { MPI_Offset recsize; /* length of 'record': sum of single record sizes of all the record variables */ - MPI_Offset numrecs; /* number of 'records' allocated */ + MPI_Offset numrecs; /* no. 'records' allocated */ MPI_Offset put_size; /* amount of writes committed so far in bytes */ MPI_Offset get_size; /* amount of reads committed so far in bytes */ MPI_Comm comm; /* MPI communicator */ int rank; /* MPI rank of this process */ - int nprocs; /* number of MPI processes */ + int nprocs; /* no. MPI processes */ + int num_nodes; /* no. unique compute nodes allocated */ + int *node_ids; /* [nprocs] node IDs of each rank */ MPI_Info mpiinfo; /* used MPI info object */ - MPI_File collective_fh; /* file handle for collective mode */ - MPI_File independent_fh; /* file handle for independent mode */ + MPI_File collective_fh; /* MPI-IO file handle for collective mode */ + MPI_File independent_fh; /* MPI-IO file handle for independent mode */ + PNCIO_File *pncio_fh; /* PNCIO file handle */ + int fstype; /* file system type: PNCIO_LUSTRE, PNCIO_UFS */ NC_dimarray dims; /* dimensions defined */ NC_attrarray attrs; /* global attributes defined */ @@ -426,36 +431,55 @@ struct NC { int maxGetReqID; /* max get request ID */ int maxPutReqID; /* max put request ID */ - int numLeadGetReqs; /* number of pending lead get requests */ - int numLeadPutReqs; /* number of pending lead put requests */ + int numLeadGetReqs; /* no. pending lead get requests */ + int numLeadPutReqs; /* no. pending lead put requests */ NC_lead_req *get_lead_list; /* list of lead nonblocking read requests */ NC_lead_req *put_lead_list; /* list of lead nonblocking write requests */ - int numGetReqs; /* number of pending nonblocking get requests */ - int numPutReqs; /* number of pending nonblocking put requests */ + int numGetReqs; /* no. pending nonblocking get requests */ + int numPutReqs; /* no. pending nonblocking put requests */ NC_req *get_list; /* list of nonblocking read requests */ NC_req *put_list; /* list of nonblocking write requests */ NC_buf *abuf; /* attached buffer, used by bput APIs */ - char *path; /* file name */ + const char *path; /* file name */ struct NC *old; /* contains the previous NC during redef. */ - /* Below are used for intra-node aggregation */ - int num_aggrs_per_node; /* number of aggregators per compute node. Set - through a user hint. 0 to disable the - intra-node aggregation, -1 to let PnetCDF to - decide. This value must be the same among all - processes. + /* Below are used for intra-node aggregation (INA) */ + MPI_Comm ina_comm; /* communicator of only intra-node aggregators */ + int ina_nprocs;/* no. processes in intra-node communicator */ + int ina_rank; /* rank ID in intra-node communicator */ + int num_aggrs_per_node; /* no. aggregators per compute node. Set through a + * user hint. 0 to disable the intra-node + * aggregation, -1 to let PnetCDF decide. This + * value must be the same among all processes. */ int my_aggr; /* rank ID of my aggregator */ - int num_nonaggrs; /* number of non-aggregators assigned */ + int num_nonaggrs; /* no.
non-aggregators assigned */ int *nonaggr_ranks; /* ranks of assigned non-aggregators */ + int *ina_node_list; /* rank IDs of INA aggregators */ + #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double aggr_time; + double ina_time_init; + double ina_time_flatten; + double ina_time_put[5]; + double ina_time_get[5]; + size_t ina_npairs_put; + size_t ina_npairs_get; + size_t maxmem_put[6]; + size_t maxmem_get[6]; #endif }; +typedef struct bufferinfo { + NC *ncp; + MPI_Offset offset; /* current read/write offset in the file */ + char *base; /* beginning of read/write buffer */ + char *pos; /* current position in buffer */ + char *end; /* end position of buffer */ +} bufferinfo; + #define NC_readonly(ncp) fIsSet((ncp)->flags, NC_MODE_RDONLY) #define NC_IsNew(ncp) fIsSet((ncp)->flags, NC_MODE_CREATE) #define NC_indef(ncp) fIsSet((ncp)->flags, NC_MODE_DEF) @@ -474,9 +498,6 @@ struct NC { (NC_EMULTIDEFINE_FIRST >= (err) && (err) >= NC_EMULTIDEFINE_LAST) /* Begin defined in nc.c ----------------------------------------------------*/ -extern void -ncmpio_free_NC(NC *ncp); - extern int ncmpio_NC_check_vlen(NC_var *varp, MPI_Offset vlen_max); @@ -487,20 +508,6 @@ extern int ncmpio_NC_check_voffs(NC *ncp); /* Begin defined in ncmpio_header_get.c -------------------------------------*/ -typedef struct bufferinfo { - MPI_Comm comm; - MPI_File collective_fh; - MPI_Offset get_size; /* amount of file read n bytes so far */ - MPI_Offset offset; /* current read/write offset in the file */ - int chunk; /* chunk size for reading the header */ - int version; /* 1, 2, and 5 for CDF-1, 2, and 5 respectively */ - int safe_mode;/* 0: disabled, 1: enabled */ - int coll_mode;/* 0: independent, 1: collective */ - char *base; /* beginning of read/write buffer */ - char *pos; /* current position in buffer */ - char *end; /* end position of buffer */ -} bufferinfo; - extern MPI_Offset ncmpio_hdr_len_NC(const NC *ncp); @@ -515,9 +522,6 @@ extern int ncmpio_write_header(NC *ncp); /* Begin defined in ncmpio_sync.c -------------------------------------------*/ -extern int -ncmpio_file_sync(NC *ncp); - extern int ncmpio_write_numrecs(NC *ncp, MPI_Offset new_numrecs); @@ -528,10 +532,6 @@ ncmpio_filetype_create_vars(const NC* ncp, const NC_var* varp, const MPI_Offset stride[], MPI_Offset *offset, MPI_Datatype *filetype, int *is_filetype_contig); -extern int -ncmpio_file_set_view(const NC *ncp, MPI_File fh, MPI_Offset *offset, - MPI_Datatype filetype); - /* Begin defined in ncmpio_igetput.m4 ---------------------------------------*/ extern int ncmpio_abuf_malloc(NC *ncp, MPI_Offset nbytes, void **buf, int *abuf_index); @@ -607,17 +607,16 @@ ncmpio_inq_var_fill(NC_var *varp, void *fill_value); extern int ncmpio_fill_vars(NC *ncp); -/* Begin defined in ncmpio_nonblocking.c ------------------------------------*/ -extern int -ncmpio_getput_zero_req(NC *ncp, int rw_flag); - -/* Begin defined in ncmpio_close.c */ -extern int -ncmpio_close_files(NC *ncp, int doUnlink); +/* Begin defined in ncmpio_close.c ------------------------------------------*/ +extern void +ncmpio_free_NC(NC *ncp); /* Begin defined in ncmpio_utils.c ------------------------------------------*/ extern void -ncmpio_set_pnetcdf_hints(NC *ncp, MPI_Info user_info, MPI_Info info_used); +ncmpio_hint_extract(NC *ncp, MPI_Info info); + +extern void +ncmpio_hint_set(NC *ncp, MPI_Info info); extern int ncmpio_NC_check_name(const char *name, int file_ver); @@ -644,23 +643,73 @@ ncmpio_unpack_xbuf(int format, NC_var *varp, MPI_Offset bufcount, MPI_Datatype etype, 
MPI_Datatype imaptype, int need_convert, int need_swap, void *buf, void *xbuf); +extern int +ncmpio_calc_off(const NC *ncp, const NC_var *varp, const MPI_Offset *start, + MPI_Offset *offset); + +extern int +ncmpio_calc_start_end(const NC *ncp, const NC_var *varp, + const MPI_Offset *start, const MPI_Offset *count, + const MPI_Offset *stride, MPI_Offset *start_off, + MPI_Offset *end_off); + /* Begin defined in ncmpio_file_io.c ----------------------------------------*/ +extern MPI_Offset +ncmpio_file_read_at(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_read_at_all(NC *ncp, MPI_Offset offset, void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern MPI_Offset +ncmpio_file_write_at_all(NC *ncp, MPI_Offset offset, const void *buf, + PNCIO_View buf_view); + +extern int +ncmpio_getput_zero_req(NC *ncp, int rw_flag); + +extern int +ncmpio_read_write(NC *ncp, int rw_flag, MPI_Offset offset, + PNCIO_View flat_btype, void *buf); + +extern int +ncmpio_file_close(NC *ncp); + +extern int +ncmpio_file_delete(NC *ncp); + +extern int +ncmpio_file_sync(NC *ncp); + +extern int +ncmpio_file_set_view(const NC *ncp, MPI_Offset disp, MPI_Datatype filetype, + MPI_Aint npairs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, MPI_Count *lengths +#else + MPI_Offset *offsets, int *lengths +#endif +); + extern int -ncmpio_read_write(NC *ncp, int rw_flag, int coll_indep, MPI_Offset offset, - MPI_Offset buf_count, MPI_Datatype buf_type, void *buf, - int buftype_is_contig); +ncmpio_file_open(NC *ncp, MPI_Comm comm, const char *path, int omode, + MPI_Info info); /* Begin defined in ncmpio_intranode.c --------------------------------------*/ extern int -ncmpio_intra_node_aggr_init(NC *ncp); +ncmpio_ina_init(NC *ncp); extern int -ncmpio_intra_node_aggregation_nreqs(NC *ncp, int mode, int num_reqs, - NC_req *put_list, MPI_Offset newnumrecs); +ncmpio_ina_nreqs(NC *ncp, int mode, int num_reqs, NC_req *put_list, + MPI_Offset newnumrecs); extern int -ncmpio_intra_node_aggregation(NC *ncp, int mode, NC_var *varp, - const MPI_Offset *start, const MPI_Offset *count, - const MPI_Offset *stride, MPI_Offset bufCount, - MPI_Datatype bufType, void *buf); +ncmpio_ina_req(NC *ncp, int mode, NC_var *varp, const MPI_Offset *start, + const MPI_Offset *count, const MPI_Offset *stride, + MPI_Offset nbytes, void *buf); #endif /* H_NC */ diff --git a/src/drivers/ncmpio/ncmpio_close.c b/src/drivers/ncmpio/ncmpio_close.c index ec79088e8..cf8553c63 100644 --- a/src/drivers/ncmpio/ncmpio_close.c +++ b/src/drivers/ncmpio/ncmpio_close.c @@ -51,46 +51,11 @@ ncmpio_free_NC(NC *ncp) if (ncp->get_list != NULL) NCI_Free(ncp->get_list); if (ncp->put_list != NULL) NCI_Free(ncp->put_list); if (ncp->abuf != NULL) NCI_Free(ncp->abuf); - if (ncp->path != NULL) NCI_Free(ncp->path); if (ncp->nonaggr_ranks != NULL) NCI_Free(ncp->nonaggr_ranks); NCI_Free(ncp); } -/*----< ncmpio_close_files() >-----------------------------------------------*/ -int -ncmpio_close_files(NC *ncp, int doUnlink) { - char *mpi_name; - int mpireturn; - - assert(ncp != NULL); /* this should never occur */ - - if (ncp->independent_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->independent_fh)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (ncp->nprocs > 1 && ncp->collective_fh != MPI_FILE_NULL) { - TRACE_IO(MPI_File_close, (&ncp->collective_fh)); - if (mpireturn != MPI_SUCCESS) - 
return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - - if (doUnlink) { - /* called from ncmpi_abort, if the file is being created and is still - * in define mode, the file is deleted */ - if (ncp->rank == 0) { - TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); - } - if (ncp->nprocs > 1) - MPI_Barrier(ncp->comm); - } - return NC_NOERR; -} - /*----< ncmpio_close() >------------------------------------------------------*/ /* This function is collective */ int @@ -159,8 +124,69 @@ ncmpio_close(void *ncdp) } #endif - /* calling MPI_File_close() */ - err = ncmpio_close_files(ncp, 0); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, j, ntimers; + double tt[16], max_t[16], put_time=0, get_time=0; + MPI_Offset sizes[16], max_sizes[16], max_npairs_put=0, max_npairs_get=0; + + /* print intra-node aggregation timing breakdown */ + if (ncp->num_aggrs_per_node > 0) { + j = 0; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_put[i]; + for (i=0; i<6; i++) sizes[j++] = ncp->maxmem_get[i]; + sizes[12] = ncp->ina_npairs_put; + sizes[13] = ncp->ina_npairs_get; + + MPI_Allreduce(sizes, max_sizes, 14, MPI_OFFSET, MPI_MAX, ncp->comm); + max_npairs_put = max_sizes[12]; + max_npairs_get = max_sizes[13]; + + for (i=0; i<12; i++) tt[i] = (float)(max_sizes[i]) / 1048576.0; /* in MiB */ + if (ncp->rank == 0 && max_npairs_put > 0) + printf("%s: INA put npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[12], tt[0],tt[1],tt[2],tt[3],tt[4],tt[5]); + if (ncp->rank == 0 && max_npairs_get > 0) + printf("%s: INA get npairs=%lld mem=%.1f %.1f %.1f %.1f %.1f %.1f (MiB)\n", + __func__, max_sizes[13], tt[6],tt[7],tt[8],tt[9],tt[10],tt[11]); + + if (max_npairs_put > 0) { /* put npairs > 0 */ + put_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 4; + for (i=0; i<ntimers; i++) { + tt[i] = ncp->ina_time_put[i]; + put_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = put_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + put_time = max_t[ntimers+2]; + if (ncp->rank == 0) + printf("%s: INA put timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],put_time); + } + if (max_npairs_get > 0) { /* get npairs > 0 */ + get_time = ncp->ina_time_init + ncp->ina_time_flatten; + ntimers = 4; + for (i=0; i<ntimers; i++) { + tt[i] = ncp->ina_time_get[i]; + get_time += tt[i]; + } + tt[ntimers] = ncp->ina_time_init; + tt[ntimers+1] = ncp->ina_time_flatten; + tt[ntimers+2] = get_time; + + MPI_Reduce(tt, max_t, ntimers+3, MPI_DOUBLE, MPI_MAX, 0, ncp->comm); + if (ncp->rank == 0) + printf("%s: INA get timing %5.2f %5.2f %5.2f %5.2f %5.2f %5.2f = %5.2f\n", + __func__, max_t[ntimers],max_t[ntimers+1],max_t[0],max_t[1],max_t[2],max_t[3],max_t[ntimers+2]); + } + } +#endif + + /* close the file */ + err = ncmpio_file_close(ncp); if (status == NC_NOERR) status = err; /* file is open for write and no variable has been defined */ @@ -219,6 +245,10 @@ ncmpio_close(void *ncdp) if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); } + /* free the intra-node aggregation communicator */ + if (ncp->ina_comm != MPI_COMM_NULL) + MPI_Comm_free(&ncp->ina_comm); + /* free up space occupied by the header metadata */ ncmpio_free_NC(ncp); diff --git a/src/drivers/ncmpio/ncmpio_create.c b/src/drivers/ncmpio/ncmpio_create.c index e5cee83d3..b1444d0b8 100644 --- a/src/drivers/ncmpio/ncmpio_create.c +++
b/src/drivers/ncmpio/ncmpio_create.c @@ -8,7 +8,6 @@ * This file implements the corresponding APIs defined in src/dispatchers/file.c * * ncmpi_create() : dispatcher->create() - * ncmpi_open() : dispatcher->open() */ #ifdef HAVE_CONFIG_H @@ -42,18 +41,21 @@ ncmpio_create(MPI_Comm comm, MPI_Info user_info, /* user's and env info combined */ void **ncpp) { - char *env_str, *filename, *mpi_name; + char *env_str, *filename, value[MPI_MAX_INFO_VAL + 1], *mpi_name; int rank, nprocs, mpiomode, err, mpireturn, default_format, file_exist=1; - int use_trunc=1; - MPI_File fh; - MPI_Info info_used; + int use_trunc=1, flag; + MPI_File fh=MPI_FILE_NULL; NC *ncp=NULL; *ncpp = NULL; + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &nprocs); + /* Note path's validity and cmode consistency have been checked in - * ncmpi_create() in src/dispatchers/file.c and - * path consistency will be done in MPI_File_open */ + * ncmpi_create() in src/dispatchers/file.c and path consistency will be + * done in MPI_File_open. + */ /* First, check whether cmode is valid or supported ---------------------*/ @@ -66,25 +68,61 @@ ncmpio_create(MPI_Comm comm, /* Check cmode for other illegal flags already done in dispatcher layer */ /* Get default format, in case cmode does not include either - * NC_64BIT_OFFSET or NC_64BIT_DATA */ + * NC_64BIT_OFFSET or NC_64BIT_DATA. + */ ncmpi_inq_default_format(&default_format); - /* Handle file clobber --------------------------------------------------*/ - MPI_Comm_rank(comm, &rank); - MPI_Comm_size(comm, &nprocs); + /* allocate buffer for header object NC and initialize its contents */ + ncp = (NC*) NCI_Calloc(1, sizeof(NC)); + if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + + *ncpp = (void*)ncp; + + ncp->ncid = ncid; + ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ + ncp->rank = rank; + ncp->nprocs = nprocs; + + /* Extract hints from user_info. Two hints must be extracted now in order + * to continue: + * nc_pncio: whether to use MPI-IO or PnetCDF's PNCIO driver. + * nc_num_aggrs_per_node: number of processes per node to be the INA + * aggregators. + * + * ncp->fstype will be set in ncmpio_hint_extract(). + */ + ncmpio_hint_extract(ncp, user_info); + if (ncp->fstype == PNCIO_FSTYPE_CHECK) + /* Check file system type. If the given file does not exist, check its + * folder. Currently PnetCDF's PNCIO drivers support Lustre + * (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS). + */ + ncp->fstype = PNCIO_FileSysType(path); + +#ifdef WKL_DEBUG +if (rank == 0) printf("%s at %d fstype=%s\n", __func__,__LINE__,(ncp->fstype == PNCIO_FSTYPE_MPIIO)? "PNCIO_FSTYPE_MPIIO" : (ncp->fstype == PNCIO_LUSTRE) ? "PNCIO_LUSTRE" : "PNCIO_UFS"); +#endif + + /* Set the file open mode in mpiomode, which may later be needed in + * ncmpi_begin_indep_data() to open the file for independent data mode. + */ mpiomode = MPI_MODE_RDWR | MPI_MODE_CREATE; - /* remove the file system type prefix name if there is any. For example, + /* Remove the file system type prefix name if there is any. For example, * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make * filename pointing to "/home/foo/testfile.nc", so it can be used in POSIX - * access() below + * access() below. */ filename = ncmpii_remove_file_system_type_prefix(path); - /* Check if the file already exists, if lstat() or access() is available */ + /* In case of clobber mode, first check if the file already exists, through + * a call to lstat() or access(), if either is available.
If neither is available, we
+     * assume the file exists and add an MPI flag to the open mode argument
+     * of MPI_File_open to delete or truncate the file first.
+     */
 #ifdef HAVE_LSTAT
-    /* call lstat() to check the file if exists and if is a symbolic link */
+    /* Call lstat() to check whether the file exists and whether it is a
+     * symbolic link */
     if (rank == 0) {
         struct stat st_buf;
         st_buf.st_mode = 0;
@@ -92,21 +130,23 @@ ncmpio_create(MPI_Comm comm,
         if (lstat(filename, &st_buf) == -1) file_exist = 0;
         errno = 0; /* reset errno */

-        /* If the file is a regular file, not a symbolic link, then we can
-         * delete the file first and later create it when calling
-         * MPI_File_open() with MPI_MODE_CREATE. It is OK to delete and then
-         * re-create the file if the file is a regular file. If there are other
-         * files symbolically linked to this file, then their links will still
-         * point to this file after it is re-created.
+        /* If the file is a regular file, not a symbolic link, then we delete
+         * the file first and later create it when calling MPI_File_open() with
+         * MPI_MODE_CREATE. If the file is a regular file, not a symbolic link,
+         * it is faster to delete it and then re-create the file, as truncating
+         * it to zero size is more expensive.
          *
         * If the file is a symbolic link, then we cannot delete the file, as
-         * the link will be gone.
+         * the link will be gone. If the file is deleted and there are other
+         * files symbolically linked to this file, then their links will become
+         * invalid.
          */
        if (S_ISREG(st_buf.st_mode)) use_trunc = 0;
     }
 #elif defined HAVE_ACCESS
-    /* if access() is available, use it to check whether file already exists
-     * rank 0 calls access() and broadcasts file_exist */
+    /* If access() is available, use it to check whether the file already
+     * exists, by having rank 0 call access() and broadcast file_exist.
+     */
     if (rank == 0) {
         if (access(filename, F_OK) == -1) file_exist = 0;
         errno = 0; /* reset errno */
@@ -114,21 +154,29 @@ ncmpio_create(MPI_Comm comm,
 #endif

     if (fIsSet(cmode, NC_NOCLOBBER)) {
-        /* check if file exists: NC_EEXIST is returned if the file already
-         * exists and NC_NOCLOBBER mode is used in ncmpi_create */
+        /* Error NC_EEXIST will be returned if the file already exists and
+         * NC_NOCLOBBER mode is set in ncmpi_create.
+         */
 #ifdef HAVE_ACCESS
         if (nprocs > 1)
             TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm);
-        if (file_exist) DEBUG_RETURN_ERROR(NC_EEXIST)
+        if (file_exist) {
+            NCI_Free(ncp);
+            DEBUG_RETURN_ERROR(NC_EEXIST)
+        }
 #else
-        /* add MPI_MODE_EXCL mode for MPI_File_open to check file existence */
+        /* Add MPI_MODE_EXCL mode for MPI_File_open, so it errors out if the
+         * file exists.
+         */
         fSet(mpiomode, MPI_MODE_EXCL);
         errno = 0; /* reset errno, as MPI_File_open may change it */
 #endif
     }
-    else { /* NC_CLOBBER is the default mode in create */
-        /* rank 0 truncates or deletes the file and ignores error code.
-         * Note calling MPI_File_set_size is expensive as it calls truncate()
+    else {
+        /* NC_CLOBBER is the default mode in ncmpi_create(). Below, rank 0
+         * truncates or deletes the file and ignores the error code. Note in
+         * some implementations of MPI-IO, calling MPI_File_set_size is
+         * expensive, as it may have all ranks call truncate().
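The delete-versus-truncate decision above hinges on whether the path is a regular file or a symbolic link. A minimal sketch of that rank-0 check (error handling omitted; the real code also resets errno and broadcasts the result):

#include <sys/stat.h>

/* Sketch: returns 1 if the file exists; *use_trunc is cleared only when
 * the path is a regular file, i.e. safe to delete and re-create. */
static int file_exists_check(const char *filename, int *use_trunc)
{
    struct stat st_buf;
    *use_trunc = 1;
    if (lstat(filename, &st_buf) == -1) return 0;
    if (S_ISREG(st_buf.st_mode)) *use_trunc = 0;
    return 1;
}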
*/ err = NC_NOERR; if (rank == 0 && file_exist) { @@ -140,27 +188,37 @@ ncmpio_create(MPI_Comm comm, err = unlink(filename); if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #else err = NC_NOERR; - TRACE_IO(MPI_File_delete, ((char *)path, MPI_INFO_NULL)); - if (mpireturn != MPI_SUCCESS) { - int errorclass; - MPI_Error_class(mpireturn, &errorclass); - if (errorclass != MPI_ERR_NO_SUCH_FILE) - /* ignore file not exist */ - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) + err = PNCIO_File_delete(filename); + else { + TRACE_IO(MPI_File_delete, (path, MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + int errorclass; + MPI_Error_class(mpireturn, &errorclass); + if (errorclass != MPI_ERR_NO_SUCH_FILE) + /* ignore file not exist */ + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } } #endif } - else { /* file is not a regular file, truncate it to zero size */ + else { + /* If file is not a regular file (e.g. a symbolic link), we + * cannot delete it and must truncate it to zero size. In this + * case, file open mode needs to remove MPI_MODE_CREATE. + */ + mpiomode = MPI_MODE_RDWR; + #ifdef HAVE_TRUNCATE - err = truncate(filename, 0); /* can be expensive */ + err = truncate(filename, 0); /* This may be expensive */ if (err < 0 && errno != ENOENT) /* ignore ENOENT: file not exist */ - DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* other error */ + DEBUG_ASSIGN_ERROR(err, NC_EFILE) /* report other error */ else err = NC_NOERR; #elif defined HAVE_OPEN @@ -173,82 +231,80 @@ ncmpio_create(MPI_Comm comm, DEBUG_ASSIGN_ERROR(err, NC_EFILE) } #else - /* call MPI_File_set_size() to truncate the file. Note this can - * be expensive. + /* When all POSIX system calls are not available, the last + * resort is to call MPI_File_set_size() to truncate the file. + * Note for some ROMIO versions that have all processes call + * truncate(), this option can be expensive. 
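The clobber step follows a common pattern: only rank 0 touches the file system, then the outcome is agreed upon by all ranks. A minimal sketch (using unlink(); the real code may truncate instead, and also broadcasts the possibly-updated open mode):

#include <errno.h>
#include <unistd.h>
#include <mpi.h>

/* Sketch: rank 0 deletes the file; every rank then receives the result. */
static int clobber_and_sync(MPI_Comm comm, const char *filename)
{
    int rank, err = 0;
    MPI_Comm_rank(comm, &rank);
    if (rank == 0 && unlink(filename) != 0 && errno != ENOENT)
        err = -1;  /* ENOENT ("file does not exist") is ignored */
    MPI_Bcast(&err, 1, MPI_INT, 0, comm);
    return err;
}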
*/
             err = NC_NOERR;
-            TRACE_IO(MPI_File_open, (MPI_COMM_SELF, (char *)path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh));
-            if (mpireturn != MPI_SUCCESS) {
-                int errorclass;
-                MPI_Error_class(mpireturn, &errorclass);
-                err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+                PNCIO_File *pncio_fh;
+                pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+                err = PNCIO_File_open(MPI_COMM_SELF, filename,
+                                      MPI_MODE_RDWR, MPI_INFO_NULL,
+                                      pncio_fh);
+                if (err == NC_NOERR) {
+                    PNCIO_File_set_size(pncio_fh, 0); /* can be expensive */
+                    PNCIO_File_close(pncio_fh);
+                }
+                NCI_Free(pncio_fh);
             }
             else {
-                TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */
+                TRACE_IO(MPI_File_open, (MPI_COMM_SELF, path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh));
                 if (mpireturn != MPI_SUCCESS) {
                     int errorclass;
                     MPI_Error_class(mpireturn, &errorclass);
                     err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
                 }
                 else {
-                    TRACE_IO(MPI_File_close, (&fh));
+                    TRACE_IO(MPI_File_set_size, (fh, 0)); /* can be expensive */
                     if (mpireturn != MPI_SUCCESS) {
                         int errorclass;
                         MPI_Error_class(mpireturn, &errorclass);
                         err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
                     }
+                    else {
+                        TRACE_IO(MPI_File_close, (&fh));
+                        if (mpireturn != MPI_SUCCESS) {
+                            int errorclass;
+                            MPI_Error_class(mpireturn, &errorclass);
+                            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+                        }
+                    }
                 }
             }
 #endif
             }
             if (errno == ENOENT) errno = 0; /* reset errno */
         }
-        /* all processes must wait here until file deletion is completed */
-        if (nprocs > 1)
-            TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, comm);
-        if (err != NC_NOERR) return err;
-    }
-
-    /* create file collectively -------------------------------------------- */
-    TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh));
-    if (mpireturn != MPI_SUCCESS) {
-#ifndef HAVE_ACCESS
-        if (fIsSet(cmode, NC_NOCLOBBER)) {
-            /* This is the case when NC_NOCLOBBER is used in file creation and
-             * function access() is not available. MPI_MODE_EXCL is set in open
-             * mode. When MPI_MODE_EXCL is used and the file already exists,
-             * MPI-IO should return error class MPI_ERR_FILE_EXISTS. But, some
-             * MPI-IO implementations (older ROMIO) do not correctly return
-             * this error class. In this case, we can do the followings: check
-             * errno to see if it set to EEXIST. Note usually rank 0 makes the
-             * file open call and can be the only one having errno set.
-             */
-            if (nprocs > 1)
-                TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm);
-            if (errno == EEXIST) DEBUG_RETURN_ERROR(NC_EEXIST)
-        }
-#endif
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the file
-         * already exists, MPI-IO should return error class MPI_ERR_FILE_EXISTS
-         * which PnetCDF will return error code NC_EEXIST. This is checked
-         * inside of ncmpii_error_mpi2nc()
+        /* All processes must wait here until the root process completes
+         * clobbering the file. Note mpiomode may be changed to remove
+         * MPI_MODE_CREATE when the file to be clobbered is a symbolic link.
          */
+        if (nprocs > 1) {
+            int msg[2];
+            msg[0] = err;
+            msg[1] = mpiomode;
+            TRACE_COMM(MPI_Bcast)(msg, 2, MPI_INT, 0, comm);
+            err      = msg[0];
+            mpiomode = msg[1];
+        }
+        if (err != NC_NOERR) return err;
     }
-    else
-        /* reset errno, as MPI_File_open may change it, even for MPI_SUCCESS */
-        errno = 0;

+    /* Now the file has been clobbered, i.e. deleted if it was not a symbolic
+     * link. If it was a symbolic link, it has now been truncated to zero
+     * size.
+ */ - /* get the I/O hints used/modified by MPI-IO */ - TRACE_IO(MPI_File_get_info, (fh, &info_used)); - if (mpireturn != MPI_SUCCESS) - return ncmpii_error_mpi2nc(mpireturn, mpi_name); + ncp->path = path; /* reuse path duplicated in dispatch layer */ + ncp->pncio_fh = NULL; /* non-aggregators have NULL pncio_fh */ + ncp->mpiomode = mpiomode; + ncp->mpiinfo = MPI_INFO_NULL; - /* Now the file has been successfully created, allocate/set NC object */ + /* For file create, ignore NC_NOWRITE if set in cmode argument. */ + ncp->iomode = cmode | NC_WRITE; - /* allocate buffer for header object NC and initialize its contents */ - ncp = (NC*) NCI_Calloc(1, sizeof(NC)); - if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM) + ncp->collective_fh = MPI_FILE_NULL; + ncp->independent_fh = MPI_FILE_NULL; /* set the file format version based on the create mode, cmode */ if (fIsSet(cmode, NC_64BIT_DATA)) ncp->format = 5; @@ -259,6 +315,7 @@ ncmpio_create(MPI_Comm comm, else ncp->format = 1; } + /* indicate this is from ncmpi_create */ fSet(ncp->flags, NC_MODE_CREATE); /* create automatically enter write mode */ fClr(ncp->flags, NC_MODE_RDONLY); @@ -267,44 +324,13 @@ ncmpio_create(MPI_Comm comm, /* PnetCDF default mode is no fill */ fClr(ncp->flags, NC_MODE_FILL); - ncp->ncid = ncid; - - /* chunk size for reading header, set to default before check hints */ - ncp->chunk = PNC_DEFAULT_CHUNKSIZE; - - /* calculate the true header size (not-yet aligned) - * No need to do this now. - * ncp->xsz = ncmpio_hdr_len_NC(ncp); - */ - /* initialize unlimited_id as no unlimited dimension yet defined */ ncp->dims.unlimited_id = -1; - /* buffer to pack noncontiguous user buffers when calling wait() */ - ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE; - - /* Extract PnetCDF specific I/O hints from user_info and set default hint - * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and - * priors fail to preserve user hints that are not recognized by the MPI - * libraries. - */ - ncmpio_set_pnetcdf_hints(ncp, user_info, info_used); - - /* For file create, ignore if NC_NOWRITE set in cmode by user */ - ncp->iomode = cmode | NC_WRITE; - ncp->comm = comm; /* reuse comm duplicated in dispatch layer */ - ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */ - ncp->mpiomode = mpiomode; - ncp->rank = rank; - ncp->nprocs = nprocs; - ncp->collective_fh = fh; - ncp->independent_fh = (nprocs > 1) ? MPI_FILE_NULL : fh; - ncp->path = (char*) NCI_Malloc(strlen(path) + 1); - strcpy(ncp->path, path); - #ifdef PNETCDF_DEBUG /* PNETCDF_DEBUG is set at configure time, which will be overwritten by - * the run-time environment variable PNETCDF_SAFE_MODE */ + * the run-time environment variable PNETCDF_SAFE_MODE. + */ ncp->safe_mode = 1; #endif /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform @@ -313,24 +339,211 @@ ncmpio_create(MPI_Comm comm, if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) { if (*env_str == '0') ncp->safe_mode = 0; else ncp->safe_mode = 1; - /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can - * be '\0' (null character). In this case, safe_mode is enabled */ + /* If PNETCDF_SAFE_MODE is set but without a value, *env_str can + * be '\0' (null character). In this case, safe_mode is enabled. + */ } - /* determine whether to enable intra-node aggregation and set up all - * intra-node aggregation metadata. - * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature - * is enabled globally for all processes. 
- * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively
-     * enabled for the aggregation group of this process.
+    /* Construct a list of unique IDs of compute nodes allocated to this job
+     * and save it in ncp->node_ids[nprocs], which contains the node ID of
+     * each rank. The node IDs are used either when intra-node aggregation
+     * (INA) is enabled or when using PnetCDF's PNCIO driver.
+     *
+     * When intra-node aggregation (INA) is enabled, node IDs are used to
+     * create a new MPI communicator consisting of the intra-node aggregators
+     * only. The communicator will be used to call file open in MPI-IO or
+     * PnetCDF's PNCIO driver. This means only intra-node aggregators will
+     * perform file I/O in PnetCDF collective put and get operations.
      */
-    ncp->my_aggr = -1;
-    if (ncp->num_aggrs_per_node != 0) {
-        err = ncmpio_intra_node_aggr_init(ncp);
+    ncp->node_ids = NULL;
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node > 0) {
+        err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids);
         if (err != NC_NOERR) return err;
+
+        /* When the total number of aggregators is >= the number of processes,
+         * disable intra-node aggregation.
+         */
+        if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs)
+            ncp->num_aggrs_per_node = 0;
     }

-    *ncpp = (void*)ncp;
+    /* ncp->num_aggrs_per_node == 0 or > 0 indicates whether the INA feature
+     * is disabled or enabled globally for all processes.
+     */
+    ncp->my_aggr       = -1;
+    ncp->ina_comm      = MPI_COMM_NULL;
+    ncp->ina_nprocs    = 0;
+    ncp->ina_rank      = -1;
+    ncp->ina_node_list = NULL;
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Divide all ranks into groups. Each group is assigned one intra-node
+         * aggregator. The following metadata related to intra-node aggregation
+         * will be set up in ncmpio_ina_init().
+         * ncp->my_aggr is the aggregator's rank ID (relative to ncp->comm) of
+         * this group. When == ncp->rank, this rank is an aggregator.
+         * ncp->num_nonaggrs is the number of non-aggregators assigned to this
+         * rank (an aggregator).
+         * ncp->ina_comm is an MPI communicator consisting of only intra-node
+         * aggregators across all nodes, which will be used when calling
+         * MPI_File_open(). For non-aggregators, it == MPI_COMM_NULL.
+         * ncp->node_ids[] will be modified to contain the node IDs of all
+         * intra-node aggregators, and will be passed to pncio_fh.
+         */
+        err = ncmpio_ina_init(ncp);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* As non-aggregators will not perform any file I/O, we can now replace
+         * comm with ina_comm. Same for nprocs.
+         */
+        comm   = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+
+        /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of
+         * this subroutine is to open the file and obtain the file handle,
+         * non-aggregators can skip it.
+         */
+        if (comm == MPI_COMM_NULL) {
+            MPI_Info_create(&ncp->mpiinfo);
+            goto fn_exit;
+        }
+    }
+
+    /* create file collectively -------------------------------------------- */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO) {
+        TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh));
+        if (mpireturn != MPI_SUCCESS) {
+#ifndef HAVE_ACCESS
+            if (fIsSet(cmode, NC_NOCLOBBER)) {
+                /* This is the case when NC_NOCLOBBER is used in file creation
+                 * and function access() is not available. MPI_MODE_EXCL is set
+                 * in open mode. When MPI_MODE_EXCL is used and the file
+                 * already exists, MPI-IO should return error class
+                 * MPI_ERR_FILE_EXISTS. But, some MPI-IO implementations (older
+                 * ROMIO) do not correctly return this error class.
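Stepping back to the intra-node aggregation setup above: an aggregators-only communicator like ncp->ina_comm can be derived with MPI_Comm_split, where non-aggregators pass MPI_UNDEFINED and receive MPI_COMM_NULL. A minimal sketch (the grouping in ncmpio_ina_init() is based on node IDs and is more involved):

#include <mpi.h>

/* Sketch: aggregators join color 0; all others get MPI_COMM_NULL. */
static MPI_Comm make_aggr_comm(MPI_Comm comm, int is_aggr)
{
    MPI_Comm newcomm;
    int rank;
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_split(comm, is_aggr ? 0 : MPI_UNDEFINED, rank, &newcomm);
    return newcomm;
}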
In this
+                 * case, we can do the following: check errno to see if it is
+                 * set to EEXIST. Note usually rank 0 makes the file open call
+                 * and can be the only one having errno set.
+                 */
+                if (nprocs > 1)
+                    TRACE_COMM(MPI_Bcast)(&errno, 1, MPI_INT, 0, comm);
+                if (errno == EEXIST) {
+                    NCI_Free(ncp);
+                    DEBUG_FOPEN_ERROR(NC_EEXIST)
+                }
+            }
+#endif
+            err = ncmpii_error_mpi2nc(mpireturn, "MPI_File_open");
+            DEBUG_FOPEN_ERROR(err);
+            /* for NC_NOCLOBBER, MPI_MODE_EXCL was added to mpiomode. If the
+             * file already exists, MPI-IO should return error class
+             * MPI_ERR_FILE_EXISTS which PnetCDF will return error code
+             * NC_EEXIST. This is checked inside of ncmpii_error_mpi2nc().
+             */
+        }
+        else
+            /* reset errno, as MPI_File_open may change it, even if it returns
+             * MPI_SUCCESS
+             */
+            errno = 0;
+
+        /* Now the file has been successfully created */
+        ncp->collective_fh  = fh;
+        ncp->independent_fh = (nprocs == 1) ? fh : MPI_FILE_NULL;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+    }
+    else {
+        /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1, sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes   = ncp->num_nodes;
+        ncp->pncio_fh->node_ids    = ncp->node_ids;
+
+        err = PNCIO_File_open(comm, filename, mpiomode, user_info,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err)
+
+        /* Now the file has been successfully created, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err)
+    }
+
+    /* Copy MPI-IO hints into ncp->mpiinfo */
+    ncmpio_hint_set(ncp, ncp->mpiinfo);
+
+fn_exit:
+    if (ncp->num_aggrs_per_node > 0) {
+        /* When intra-node aggregation is enabled, it is necessary to make sure
+         * non-aggregators obtain consistent values of file striping hints.
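The striping-hint consistency step below parses hint strings with strtol guarded by errno. A minimal sketch of that idiom (the helper name is hypothetical):

#include <errno.h>
#include <stdlib.h>

/* Sketch: parse an integer hint string; fall back to 0 on any error. */
static int parse_hint_int(const char *value)
{
    char *endp;
    long v;
    errno = 0;  /* must be cleared before strtol */
    v = strtol(value, &endp, 10);
    if (errno != 0 || endp == value) return 0;
    return (int) v;
}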
+         *
+         * Non-aggregators do not have hints returned from MPI_File_get_info().
+         */
+        int striping_info[2];
+        if (ncp->rank == 0) {
+            MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[0] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[0] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[0] = 0;
+            }
+
+            MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[1] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[1] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[1] = 0;
+            }
+        }
+
+        MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm);
+
+        if (ncp->my_aggr != ncp->rank) {
+            sprintf(value, "%d", striping_info[0]);
+            MPI_Info_set(ncp->mpiinfo, "striping_unit", value);
+            sprintf(value, "%d", striping_info[1]);
+            MPI_Info_set(ncp->mpiinfo, "striping_factor", value);
+        }
+    }
+
+/*
+if (ncp->rank == 0) {
+    int i, nkeys;
+    MPI_Info_get_nkeys(ncp->mpiinfo, &nkeys);
+    printf("%s line %d: MPI File Info: nkeys = %d\n",__func__,__LINE__,nkeys);
+    for (i=0; i<nkeys; i++) {
+        char key[MPI_MAX_INFO_KEY];
+        int valuelen;
+        MPI_Info_get_nthkey(ncp->mpiinfo, i, key);
+        MPI_Info_get_valuelen(ncp->mpiinfo, key, &valuelen, &flag);
+        MPI_Info_get(ncp->mpiinfo, key, valuelen+1, value, &flag);
+        printf("MPI File Info: [%2d] key = %25s, value = %s\n",i,key,value);
+    }
+}
+*/
+
+    /* ina_node_list is no longer needed */
+    if (ncp->ina_node_list != NULL) {
+        NCI_Free(ncp->ina_node_list);
+        ncp->ina_node_list = NULL;
+    }
+    /* node_ids is no longer needed */
+    if (ncp->node_ids != NULL) {
+        NCI_Free(ncp->node_ids);
+        ncp->node_ids = NULL;
+    }
+    if (ncp->pncio_fh != NULL)
+        ncp->pncio_fh->node_ids = NULL;

     return NC_NOERR;
 }
diff --git a/src/drivers/ncmpio/ncmpio_enddef.c b/src/drivers/ncmpio/ncmpio_enddef.c
index efc99657a..b45219a91 100644
--- a/src/drivers/ncmpio/ncmpio_enddef.c
+++ b/src/drivers/ncmpio/ncmpio_enddef.c
@@ -118,8 +118,8 @@ move_file_block(NC *ncp,
         get_size = pread(fd, buf, chunk_size, off_from);
         if (get_size < 0) {
             fprintf(stderr,
-                    "Error at %s line %d: pread file %s offset "OFFFMT" size %zd (%s)\n",
-                    __func__,__LINE__,path,off_from,chunk_size,strerror(errno));
+                    "Error at %s line %d: pread file %s offset %lld size %zd (%s)\n",
+                    __func__,__LINE__,path,(long long)off_from,chunk_size,strerror(errno));
             DEBUG_RETURN_ERROR(NC_EREAD)
         }
         ncp->get_size += get_size;
@@ -138,8 +138,8 @@ move_file_block(NC *ncp,
         put_size = pwrite(fd, buf, get_size, off_to);
         if (put_size < 0) {
             fprintf(stderr,
-                    "Error at %s line %d: pwrite file %s offset "OFFFMT" size %zd (%s)\n",
-                    __func__,__LINE__,path,off_to,get_size,strerror(errno));
+                    "Error at %s line %d: pwrite file %s offset %lld size %zd (%s)\n",
+                    __func__,__LINE__,path,(long long)off_to,get_size,strerror(errno));
             DEBUG_RETURN_ERROR(NC_EREAD)
         }
         ncp->put_size += put_size;
@@ -167,21 +167,21 @@ move_file_block(NC *ncp,
                 MPI_Offset from,   /* source starting file offset */
                 MPI_Offset nbytes) /* amount to be moved */
 {
-    char *mpi_name;
-    int rank, nprocs, mpireturn, err, status=NC_NOERR, do_coll;
+    int rank, nprocs, status=NC_NOERR, do_coll;
     void *buf;
     size_t num_moves, mv_amnt, p_units;
-    MPI_Offset off_last, off_from, off_to;
-    MPI_Status mpistatus;
-    MPI_File fh;
+    MPI_Offset off_last, off_from, off_to, rlen, wlen;
+    MPI_Comm comm;

-    rank   = ncp->rank;
-    nprocs = ncp->nprocs;
-
-    /* collective_fh can be used in either MPI independent or collective I/O
-     * APIs to move data, within this subroutine.
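The serial path of move_file_block() shown above copies a file region through a bounce buffer with pread()/pwrite(). A minimal sketch of one such chunk move (the helper name is hypothetical; the real code also accumulates get_size/put_size and maps failures to NC error codes):

#include <unistd.h>

/* Sketch: copy up to len bytes from off_from to off_to via buf. */
static int move_chunk(int fd, char *buf, size_t len,
                      off_t off_from, off_t off_to)
{
    ssize_t r = pread(fd, buf, len, off_from);
    if (r < 0) return -1;
    if (pwrite(fd, buf, (size_t)r, off_to) != r) return -1;
    return 0;
}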
+ /* If intra-node aggregation is enabled, then only the aggregators perform + * the movement. */ - fh = ncp->collective_fh; + if (ncp->num_aggrs_per_node > 0 && ncp->ina_comm == MPI_COMM_NULL) + return NC_NOERR; + + comm = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->comm : ncp->ina_comm; + rank = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->rank : ncp->ina_rank; + nprocs = (ncp->ina_comm == MPI_COMM_NULL) ? ncp->nprocs : ncp->ina_nprocs; /* MPI-IO fileview has been reset in ncmpi_redef() to make the entire file * visible @@ -192,7 +192,7 @@ move_file_block(NC *ncp, * independent I/O subroutines, as the data partitioned among processes are * not interleaved and thus need no collective I/O. */ - do_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); + do_coll = (nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)); /* buf will be used as a temporal buffer to move data in chunks, i.e. * read a chunk and later write to the new location @@ -211,7 +211,7 @@ move_file_block(NC *ncp, /* move the data section starting from its tail toward its beginning */ while (nbytes > 0) { - int chunk_size, get_size=0; + int chunk_size; if (mv_amnt == p_units) { /* each rank moves amount of chunk_size */ @@ -231,88 +231,33 @@ move_file_block(NC *ncp, chunk_size = 0; } - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - mpireturn = MPI_SUCCESS; + PNCIO_View buf_view; + buf_view.type = MPI_BYTE; + buf_view.size = chunk_size; + buf_view.count = 1; + buf_view.is_contig = 1; /* read from file at off_from for amount of chunk_size */ - if (do_coll) { - TRACE_IO(MPI_File_read_at_all, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - else if (chunk_size > 0) { - TRACE_IO(MPI_File_read_at, (fh, off_from, buf, chunk_size, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR && err == NC_EFILE) - DEBUG_ASSIGN_ERROR(status, NC_EREAD) - get_size = chunk_size; - } - else if (chunk_size > 0) { - /* for zero-length read, MPI_Get_count may report incorrect result - * for some MPICH version, due to the uninitialized MPI_Status - * object passed to MPI-IO calls. Thus we initialize it above to - * work around. See MPICH ticket: - * https://trac.mpich.org/projects/mpich/ticket/2332 - * - * Update the number of bytes read since file open. - * Because each rank reads and writes no more than one chunk_size - * at a time and chunk_size is < NC_MAX_INT, it is OK to call - * MPI_Get_count, instead of MPI_Get_count_c. - */ - MPI_Get_count(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; - } + rlen = 0; + if (do_coll) + rlen = ncmpio_file_read_at_all(ncp, off_from, buf, buf_view); + else if (chunk_size > 0) + rlen = ncmpio_file_read_at(ncp, off_from, buf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; /* to prevent from one rank's write run faster than other's read */ - if (ncp->nprocs > 1) MPI_Barrier(ncp->comm); - - /* explicitly initialize mpistatus object to 0. For zero-length read, - * MPI_Get_count may report incorrect result for some MPICH version, - * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. 
- */
-            memset(&mpistatus, 0, sizeof(MPI_Status));
-            mpireturn = MPI_SUCCESS;
+            if (nprocs > 1) MPI_Barrier(comm);

-            /* Write to new location at off_to for amount of get_size. Assuming the
-             * call to MPI_Get_count() above returns the actual amount of data read
-             * from the file, i.e. get_size.
+            /* Write to the new location at off_to for rlen bytes, the amount
+             * actually read above.
              */
-            if (do_coll) {
-                TRACE_IO(MPI_File_write_at_all, (fh, off_to, buf,
-                                                 get_size /* NOT chunk_size */,
-                                                 MPI_BYTE, &mpistatus));
-            }
-            else if (get_size > 0) {
-                TRACE_IO(MPI_File_write_at, (fh, off_to, buf,
-                                             get_size /* NOT chunk_size */,
-                                             MPI_BYTE, &mpistatus));
-            }
-            if (mpireturn != MPI_SUCCESS) {
-                err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-                if (status == NC_NOERR && err == NC_EFILE)
-                    DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
-            }
-            else if (get_size > 0) {
-                /* update the number of bytes written since file open.
-                 * Because each rank reads and writes no more than one chunk_size
-                 * at a time and chunk_size is < NC_MAX_INT, it is OK to call
-                 * MPI_Get_count, instead of MPI_Get_count_c.
-                 */
-                int put_size;
-                mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-                if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                    ncp->put_size += get_size; /* or chunk_size */
-                else
-                    ncp->put_size += put_size;
-            }
+            buf_view.size = (rlen > 0) ? rlen : 0;
+            wlen = 0;
+            if (do_coll)
+                /* all ranks must participate in the collective write, even
+                 * when their rlen is zero */
+                wlen = ncmpio_file_write_at_all(ncp, off_to, buf, buf_view);
+            else if (rlen > 0)
+                wlen = ncmpio_file_write_at(ncp, off_to, buf, buf_view);
+            if (status == NC_NOERR && wlen < 0) status = (int)wlen;

         /* move on to the next round */
         mv_amnt = p_units;
@@ -602,20 +547,26 @@ NC_begins(NC *ncp)
 static int
 write_NC(NC *ncp)
 {
-    char *mpi_name;
-    int status=NC_NOERR, mpireturn, err, is_coll;
+    int status=NC_NOERR, is_coll=0;
     MPI_Offset i, header_wlen, ntimes;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    PNCIO_View buf_view;

     assert(!NC_readonly(ncp));

+    buf_view.is_contig = 1;
+
     /* Depending on whether NC_HCOLL is set, writing file header can be done
      * through either MPI collective or independent write call.
      * When
      *   ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh
+     * For ranks participating in the collective MPI write call, is_coll is
+     * set to 1; otherwise it is 0.
      */
-    is_coll = (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) ? 1 : 0;
-    fh = ncp->collective_fh;
+    if (fIsSet(ncp->flags, NC_HCOLL)) {
+        if (ncp->num_aggrs_per_node > 0)
+            is_coll = (ncp->ina_nprocs > 1 && ncp->rank == ncp->my_aggr);
+        else
+            is_coll = (ncp->nprocs > 1);
+    }

     /* In NC_begins(), root's ncp->xsz and ncp->begin_var, root's header
      * size and extent, have been broadcast (sync-ed) among processes.
@@ -673,64 +624,44 @@ write_NC(NC *ncp)

     /* rank 0's fileview already includes the file header */

-    /* explicitly initialize mpistatus object to 0. For zero-length read,
-     * MPI_Get_count may report incorrect result for some MPICH version,
-     * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-     * Thus we initialize it above to work around.
- */
-    memset(&mpistatus, 0, sizeof(MPI_Status));
-
     /* write the header in chunks */
     offset  = 0;
     remain  = header_wlen;
     buf_ptr = buf;
+    buf_view.type  = MPI_BYTE;
+    buf_view.count = 1;
     for (i=0; i<ntimes; i++) {
-        int put_size, bufCount = (int) MIN(remain, NC_MAX_INT);
-        if (is_coll)
-            TRACE_IO(MPI_File_write_at_all, (fh, offset, buf_ptr, bufCount,
-                                             MPI_BYTE, &mpistatus));
-        else
-            TRACE_IO(MPI_File_write_at, (fh, offset, buf_ptr, bufCount,
-                                         MPI_BYTE, &mpistatus));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (status == NC_NOERR && err == NC_EFILE)
-                DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
-        }
-        else {
-            mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-            if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                ncp->put_size += bufCount;
-            else
-                ncp->put_size += put_size;
-        }
-        offset  += bufCount;
-        buf_ptr += bufCount;
-        remain  -= bufCount;
+        MPI_Offset wlen;
+        buf_view.size = MIN(remain, NC_MAX_INT);
+        if (is_coll)
+            wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view);
+        if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
+        offset  += buf_view.size;
+        buf_ptr += buf_view.size;
+        remain  -= buf_view.size;
     }
     NCI_Free(buf);
 }
-    else if (fIsSet(ncp->flags, NC_HCOLL)) {
+    else if (is_coll) {
         /* other processes participate the collective call */
-        for (i=0; i<ntimes; i++)
-            TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE,
-                                             &mpistatus));
+        buf_view.type  = MPI_BYTE;
+        buf_view.count = 1;
+        buf_view.size  = 0;
+        for (i=0; i<ntimes; i++)
+            ncmpio_file_write_at_all(ncp, 0, NULL, buf_view);
     }

     if (ncp->safe_mode == 1 && ncp->nprocs > 1) {
         /* broadcast root's status, because only root writes to the file */
-        int root_status = status;
+        int mpireturn, root_status = status;
         TRACE_COMM(MPI_Bcast)(&root_status, 1, MPI_INT, 0, ncp->comm);
-        /* root's write has failed, which is more serious than inconsistency */
-        if (root_status == NC_EWRITE) DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
+        if (mpireturn != MPI_SUCCESS)
+            status = ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast");
+        else if (root_status == NC_EWRITE)
+            /* root's write has failed, more serious than inconsistency */
+            DEBUG_ASSIGN_ERROR(status, NC_EWRITE)
     }

     fClr(ncp->flags, NC_NDIRTY);
@@ -746,14 +677,14 @@ write_NC(NC *ncp)
  */
 #define CHECK_ERROR(err) {                                              \
     if (ncp->safe_mode == 1 && ncp->nprocs > 1) {                       \
-        int status;                                                     \
-        TRACE_COMM(MPI_Allreduce)(&err, &status, 1, MPI_INT, MPI_MIN,   \
+        int min_err;                                                    \
+        TRACE_COMM(MPI_Allreduce)(&err, &min_err, 1, MPI_INT, MPI_MIN,  \
                                   ncp->comm);                           \
         if (mpireturn != MPI_SUCCESS) {                                 \
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce");      \
             DEBUG_RETURN_ERROR(err)                                     \
         }                                                               \
-        if (status != NC_NOERR) return status;                          \
+        if (min_err != NC_NOERR) return min_err;                        \
     }                                                                   \
     else if (err != NC_NOERR)                                           \
         return err;                                                     \
@@ -1120,7 +1051,7 @@ read_hints(NC *ncp)

     /* get hints from the environment variable PNETCDF_HINTS, a string of
      * hints separated by ";" and each hint is in the form of hint=value. E.g.
-     * "cb_nodes=16;cb_config_list=*:6". If this environment variable is set,
+     * "cb_nodes=16;romio_ds_write=true". If this environment variable is set,
      * it overrides the same hints that were set by MPI_Info_set() called in
      * the application program.
*/ @@ -1309,26 +1240,28 @@ ncmpio__enddef(void *ncdp, if (ncp->r_align == 0) ncp->r_align = 4; else ncp->r_align = D_RNDUP(ncp->r_align, 4); - /* reflect the hint changes to the MPI info object, so the user can inquire - * what the true hint values are being used - */ - sprintf(value, OFFFMT, ncp->v_align); - MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); - sprintf(value, OFFFMT, ncp->r_align); - MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); + if (ncp->mpiinfo != MPI_INFO_NULL) { + /* reflect the hint changes to the MPI info object, so the user can + * inquire what the true hint values are being used + */ + sprintf(value, OFFFMT, ncp->v_align); + MPI_Info_set(ncp->mpiinfo, "nc_var_align_size", value); + sprintf(value, OFFFMT, ncp->r_align); + MPI_Info_set(ncp->mpiinfo, "nc_record_align_size", value); #ifdef ENABLE_SUBFILING - sprintf(value, "%d", ncp->num_subfiles); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); - if (ncp->num_subfiles > 1) { - /* TODO: should return subfile-related msg when there's an error */ - err = ncmpio_subfile_partition(ncp); - CHECK_ERROR(err) - } + sprintf(value, "%d", ncp->num_subfiles); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", value); + if (ncp->num_subfiles > 1) { + /* TODO: should return subfile-related msg when there's an error */ + err = ncmpio_subfile_partition(ncp); + CHECK_ERROR(err) + } #else - MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); - MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); + MPI_Info_set(ncp->mpiinfo, "pnetcdf_subfiling", "disable"); + MPI_Info_set(ncp->mpiinfo, "nc_num_subfiles", "0"); #endif + } /* check whether sizes of all variables are legal */ err = ncmpio_NC_check_vlens(ncp); @@ -1491,7 +1424,8 @@ ncmpio__enddef(void *ncdp, /* first sync header objects in memory across all processes, and then root * writes the header to file. Note safe_mode error check will be done in - * write_NC() */ + * write_NC(). + */ status = write_NC(ncp); /* we should continue to exit define mode, even if header is inconsistent diff --git a/src/drivers/ncmpio/ncmpio_file_io.c b/src/drivers/ncmpio/ncmpio_file_io.c index 681cfd599..e519a3d96 100644 --- a/src/drivers/ncmpio/ncmpio_file_io.c +++ b/src/drivers/ncmpio/ncmpio_file_io.c @@ -17,313 +17,909 @@ #include #include "ncmpio_NC.h" -/*----< ncmpio_read_write() >------------------------------------------------*/ -int -ncmpio_read_write(NC *ncp, - int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ - int coll_indep, /* NC_REQ_COLL or NC_REQ_INDEP */ - MPI_Offset offset, - MPI_Offset buf_count, - MPI_Datatype buf_type, - void *buf, - int buftype_is_contig) +/*----< get_count() >--------------------------------------------------------*/ +/* This subroutine is independent. On success, the number of bytes read/written + * is returned (zero indicates nothing was read/written). Like POSIX read()/ + * write(), it is not an error if this number is smaller than the number of + * bytes requested. On error, a negative value, an NC error code, is returned. 
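The byte-count recovery performed by get_count() below amounts to: number of datatype instances reported in the status, times the datatype size. A minimal sketch (the helper name is hypothetical; the real code also handles the MPI_Count variants):

#include <mpi.h>

/* Sketch: bytes transferred according to an MPI_Status, or -1 when the
 * count is undefined (e.g. a partial transfer). */
static MPI_Offset bytes_transferred(MPI_Status *st, MPI_Datatype dt)
{
    int count, size;
    if (MPI_Get_count(st, dt, &count) != MPI_SUCCESS ||
        count == MPI_UNDEFINED)
        return -1;
    MPI_Type_size(dt, &size);
    return (MPI_Offset)count * size;
}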
+ */ +static +MPI_Offset get_count(MPI_Status *mpistatus, + MPI_Datatype datatype) { - char *mpi_name; - int status=NC_NOERR, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; - MPI_File fh; - MPI_Offset req_size; + int mpireturn; + + if (datatype == MPI_DATATYPE_NULL) return 0; #ifdef HAVE_MPI_TYPE_SIZE_C - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_c is introduced in MPI 4.0 */ - mpireturn = MPI_Type_size_c(buf_type, &btype_size); - mpi_name = "MPI_Type_size_c"; + MPI_Type_size_c(datatype, &type_size); #elif defined(HAVE_MPI_TYPE_SIZE_X) - MPI_Count btype_size; + MPI_Count type_size; /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(buf_type, &btype_size); - mpi_name = "MPI_Type_size_x"; + MPI_Type_size_x(datatype, &type_size); #else - int btype_size; - mpireturn = MPI_Type_size(buf_type, &btype_size); - mpi_name = "MPI_Type_size"; + int type_size; + MPI_Type_size(datatype, &type_size); #endif - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - err = (err == NC_EFILE) ? NC_EREAD : err; - } - else if (btype_size == MPI_UNDEFINED) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: btype_size MPI_UNDEFINED buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + +#ifdef HAVE_MPI_GET_COUNT_C + MPI_Count count; + mpireturn = MPI_Get_count_c(mpistatus, datatype, &count); +#else + int count; + mpireturn = MPI_Get_count(mpistatus, datatype, &count); #endif - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - } - if (err != NC_NOERR) { - if (coll_indep == NC_REQ_COLL) { - DEBUG_ASSIGN_ERROR(status, err) - /* write nothing, but participate the collective call */ - buf_count = 0; - } - else - DEBUG_RETURN_ERROR(err) - } + if (mpireturn != MPI_SUCCESS || count == MPI_UNDEFINED) + /* In case of partial read/write, MPI_Get_elements() is supposed to be + * called to obtain the number of type map elements actually + * read/written in order to calculate the true read/write amount. Below + * skips this step and simply returns the partial read/write amount. + * See an example usage of MPI_Get_count() in Example 5.12 from MPI + * standard document. + */ + return NC_EFILE; - /* request size in bytes, may be > NC_MAX_INT */ - req_size = buf_count * btype_size; + return (MPI_Offset)count * type_size; +} - /* explicitly initialize mpistatus object to 0. For zero-length read, +/*----< ncmpio_file_read_at() >----------------------------------------------*/ +/* + * This function is independent. + */ +/* TODO: move check count against MAX_INT and call _c API */ +MPI_Offset +ncmpio_file_read_at(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. For zero-length read/write, * MPI_Get_count may report incorrect result for some MPICH version, * due to the uninitialized MPI_Status object passed to MPI-IO calls. - * Thus we initialize it above to work around. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 */ memset(&mpistatus, 0, sizeof(MPI_Status)); - if (coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - else - fh = ncp->independent_fh; + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; - if (rw_flag == NC_REQ_RD) { - void *xbuf=buf; - MPI_Datatype xbuf_type=buf_type; + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? 
ncp->independent_fh : ncp->collective_fh; #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count xlen = (MPI_Count)buf_count; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - int xlen = (int)buf_count; + int count = (buf_view.is_contig) ? buf_view.size : 1; - if (buf_count > NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { + if (buf_view.size > NC_MAX_INT) { #ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; - } - else - DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) } + TRACE_IO(MPI_File_read_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) + } + + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_read_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_read_at_all() >------------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_read_at_all(NC *ncp, + MPI_Offset offset, + void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; - if (xlen > 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if read buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to read, as some MPI, - * e.g. Cray on KNL, can be significantly slow when read buffer is - * noncontiguous. - */ #ifdef HAVE_MPI_LARGE_COUNT - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_read_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - if (req_size > NC_MAX_INT) { - mpireturn = MPI_Type_contiguous(xlen, buf_type, &xbuf_type); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous"); - if (coll_indep == NC_REQ_COLL) - DEBUG_ASSIGN_ERROR(status, err) - else - DEBUG_RETURN_ERROR(err) - } - MPI_Type_commit(&xbuf_type); - xlen = 1; - } - else { - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - } + int count = (buf_view.is_contig) ? 
buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - xbuf = NCI_Malloc((size_t)req_size); + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but read nothing */ + count = 0; + } + TRACE_IO(MPI_File_read_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD) } - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) { + /* update the number of bytes read since file open */ + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_read_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes read since file open */ + if (amnt >= 0) ncp->get_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at() >---------------------------------------------*/ +/* + * This function is independent. + */ +MPI_Offset +ncmpio_file_write_at(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* Explicitly initialize mpistatus object to 0. For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, count, buf_view.type, + &mpistatus)); #else - TRACE_IO(MPI_File_read_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - } else { + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } + TRACE_IO(MPI_File_write_at, (fh, offset, buf, count, buf_view.type, + &mpistatus)); +#endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_write_at(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_file_write_at_all() >-----------------------------------------*/ +/* + * This function is collective. + */ +MPI_Offset +ncmpio_file_write_at_all(NC *ncp, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR, mpireturn; + MPI_Offset amnt=0; + MPI_Status mpistatus; + + /* explicitly initialize mpistatus object to 0. 
For zero-length read/write, + * MPI_Get_count may report incorrect result for some MPICH version, + * due to the uninitialized MPI_Status object passed to MPI-IO calls. + * Thus we initialize it above to work around. See MPICH ticket: + * https://trac.mpich.org/projects/mpich/ticket/2332 + */ + memset(&mpistatus, 0, sizeof(MPI_Status)); + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + MPI_File fh; + + fh = fIsSet(ncp->flags, NC_MODE_INDEP) + ? ncp->independent_fh : ncp->collective_fh; + #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_read_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count count = (buf_view.is_contig) ? buf_view.size : 1; + + TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, count, + buf_view.type, &mpistatus)); #else - TRACE_IO(MPI_File_read_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + int count = (buf_view.is_contig) ? buf_view.size : 1; + + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif + DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) + /* participate the collective call, but write nothing */ + count = 0; } + TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, count, + buf_view.type, &mpistatus)); +#endif if (mpireturn != MPI_SUCCESS) { err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EWRITE) + } + + if (err == NC_NOERR) + amnt = get_count(&mpistatus, buf_view.type); + } + else + amnt = PNCIO_File_write_at_all(ncp->pncio_fh, offset, buf, buf_view); + + /* update the number of bytes written since file open */ + if (amnt >= 0) ncp->put_size += amnt; + /* else: ignore if error, as this error is not fatal */ + + return amnt; +} + +/*----< ncmpio_getput_zero_req() >-------------------------------------------*/ +/* This function is called when this process has zero-length I/O request and + * must participate all the MPI collective calls involved in the collective + * APIs and wait_all(), which include setting fileview, collective read/write, + * another setting fileview. + * + * This function is collective. + */ +int +ncmpio_getput_zero_req(NC *ncp, int reqMode) +{ + int err, status=NC_NOERR; + MPI_Offset rlen, wlen; + PNCIO_View buf_view; + + buf_view.size = 0; + + /* When intra-node aggregation is enabled, non-aggregators do not access + * the file. + */ + if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr) + return NC_NOERR; + + /* do nothing if this came from an independent API */ + if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR; + + err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + if (status == NC_NOERR) status = err; + + if (fIsSet(reqMode, NC_REQ_RD)) { + if (ncp->nprocs > 1) + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + else + rlen = ncmpio_file_read_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + } + else { /* write request */ + if (ncp->nprocs > 1) + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + else + wlen = ncmpio_file_write_at(ncp, 0, NULL, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + } + + /* Reset fileview. 
Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + /* No longer need to reset the file view, as the root's fileview includes + * the whole file header. + */ + + return status; +} + +/*----< ncmpio_read_write() >------------------------------------------------*/ +int +ncmpio_read_write(NC *ncp, + int rw_flag, /* NC_REQ_WR or NC_REQ_RD */ + MPI_Offset offset, + PNCIO_View buf_view, + void *buf) +{ + char *mpi_name; + int i, status=NC_NOERR, err=NC_NOERR, mpireturn, coll_indep; + int to_free_buftype=0; + MPI_Offset rlen, wlen; + + coll_indep = NC_REQ_INDEP; + if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + coll_indep = NC_REQ_COLL; + + /* for zero-sized request */ + if (buf_view.size == 0) { + if (coll_indep == NC_REQ_INDEP) + return NC_NOERR; + + if (rw_flag == NC_REQ_RD) { + rlen = ncmpio_file_read_at_all(ncp, 0, NULL, buf_view); + if (rlen < 0) status = (int)rlen; } else { - /* update the number of bytes read since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count get_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &get_size); - ncp->get_size += get_size; -#else - int get_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &get_size); - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); + wlen = ncmpio_file_write_at_all(ncp, 0, NULL, buf_view); + if (wlen < 0) status = (int)wlen; + } + goto fn_exit; + } + + /* buf_view.count is the number of offset-length pairs */ + + /* buf_view.size is in bytes, may be > NC_MAX_INT */ + + if (rw_flag == NC_REQ_RD) { + void *xbuf=buf; + +#ifndef HAVE_MPI_LARGE_COUNT + if (buf_view.size > NC_MAX_INT) { +#ifdef PNETCDF_DEBUG + fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buffer size="OFFFMT"\n", + ncp->rank, __func__,__LINE__,buf_view.size); #endif - if (mpireturn != MPI_SUCCESS || get_size == MPI_UNDEFINED) - ncp->get_size += req_size; - else - ncp->get_size += btype_size * get_size; + if (coll_indep == NC_REQ_COLL) { + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + /* write nothing, but participate the collective call */ + buf_view.size = 0; } + else + DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) + } #endif + +// printf("%s at %d: buf_view count=%lld type=%s size=%lld\n",__func__,__LINE__, buf_view.count, (buf_view.type==MPI_BYTE)?"MPI_BYTE":"NOT MPI_BYTE", buf_view.size); + + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of read buffer being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If read buffer is noncontiguous and size is < + * ncp->ibuf_size, we allocate a temporary contiguous buffer and + * use it to read. Later it is unpacked to user buffer. As some + * MPI, e.g. Cray on KNL, can be significantly slow when write + * buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. 
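The bounce-buffer strategy above reads into a contiguous scratch buffer and then scatters it to the noncontiguous user buffer described by offset/length pairs. A minimal sketch of that unpack step (field layout assumed from PNCIO_View; the helper name is hypothetical):

#include <string.h>

/* Sketch: scatter a contiguous buffer out to (off[i], len[i]) regions. */
static void unpack_pairs(char *user_buf, const char *xbuf,
                         const long long *off, const long long *len,
                         long long n)
{
    long long i;
    for (i = 0; i < n; i++) {
        memcpy(user_buf + off[i], xbuf, (size_t)len[i]);
        xbuf += len[i];
    }
}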
+ */ + xbuf = NCI_Malloc(buf_view.size); + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; } - if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - mpireturn = MPI_Unpack_c(xbuf, xlen, &pos, buf, (MPI_Count)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; +#else + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; #else - int pos=0; - mpireturn = MPI_Unpack(xbuf, xlen, &pos, buf, (int)buf_count, - buf_type, MPI_COMM_SELF); - mpi_name = "MPI_Unpack"; + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) + rlen = ncmpio_file_read_at_all(ncp, offset, xbuf, buf_view); + else + rlen = ncmpio_file_read_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && rlen < 0) status = (int)rlen; + + if (xbuf != buf) { /* unpack contiguous xbuf to noncontiguous buf */ + char *in_ptr, *out_ptr; + in_ptr = xbuf; + +#if 0 + long long *wkl, nelems; int j; + wkl = (long long*) malloc(buf_view.size); + nelems=buf_view.size/8; + memcpy(wkl, xbuf, nelems*8); ncmpii_in_swapn(wkl, nelems, 8); + printf("%s at %d: nelems=%lld xbuf=(%p) ",__func__,__LINE__, nelems, xbuf); + for (i=0; i NC_MAX_INT) { - if (coll_indep == NC_REQ_COLL) { -#ifdef PNETCDF_DEBUG - fprintf(stderr,"%d: %s line %d: NC_EINTOVERFLOW buf_count="OFFFMT"\n", - ncp->rank, __func__,__LINE__,buf_count); + if (!buf_view.is_contig && buf_view.size <= ncp->ibuf_size) { + /* The only case of write buffer being noncontiguous is when + * nonblocking API ncmpi_wait/wait_all() is called and INA is + * disabled. If write buffer is noncontiguous and size is < + * ncp->ibuf_size, pack it a temporary contiguous buffer and use it + * to write. As some MPI, e.g. Cray on KNL, can be significantly + * slow when write buffer is noncontiguous. + * + * Note ncp->ibuf_size is never > NC_MAX_INT. + */ + char *in_ptr, *out_ptr; + xbuf = NCI_Malloc(buf_view.size); + out_ptr = xbuf; +assert(buf != NULL); +// printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size); + +#if 0 +printf("%s at %d: buf = %p\n",__func__,__LINE__, buf); +printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count,buf_view.off[0],buf_view.off[1],buf_view.len[0],buf_view.len[1]); +int wkl[21]; #endif - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - /* write nothing, but participate the collective call */ - xlen = 0; + for (i=0; i 0 && !buftype_is_contig && req_size <= ncp->ibuf_size) { - /* if write buffer is noncontiguous and size is < ncp->ibuf_size, - * allocate a temporary buffer and use it to write, as some MPI, - * e.g. Cray on KNL, can be significantly slow when write buffer is - * noncontiguous. 
- */ + if (!buf_view.is_contig && ncp->fstype == PNCIO_FSTYPE_MPIIO) { + /* construct a buftype */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count pos=0; - xbuf_type = MPI_BYTE; - xlen = (MPI_Count)req_size; - xbuf = NCI_Malloc(req_size); - mpireturn = MPI_Pack_c(buf, (MPI_Count)buf_count, buf_type, xbuf, - (MPI_Count)req_size, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack_c"; + /* TODO: MPI_Type_create_hindexed_c + * buf_view.count should be of type MPI_Count + * buf_view.len should be of type MPI_Count + * buf_view.off should be of type MPI_Count + */ + mpireturn = MPI_Type_create_hindexed_c(buf_view.count, + buf_view.len, + buf_view.off, + MPI_BYTE, &buf_view.type); + mpi_name = "MPI_Type_create_hindexed_c"; #else - if (req_size > NC_MAX_INT) { - /* skip packing write data into a temp buffer */ - xlen = (int)buf_count; - xbuf_type = buf_type; - mpireturn = MPI_SUCCESS; - } - else { - int pos=0; - xbuf_type = MPI_BYTE; - xlen = (int)req_size; - xbuf = NCI_Malloc(xlen); - mpireturn = MPI_Pack(buf, (int)buf_count, buf_type, xbuf, - xlen, &pos, MPI_COMM_SELF); - mpi_name = "MPI_Pack"; - } + MPI_Aint *disp; +#if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view.off; +#else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * buf_view.count); + for (j=0; jnprocs > 1 && coll_indep == NC_REQ_COLL) { + if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) + wlen = ncmpio_file_write_at_all(ncp, offset, xbuf, buf_view); + else + wlen = ncmpio_file_write_at(ncp, offset, xbuf, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; + + if (xbuf != buf) NCI_Free(xbuf); + if (to_free_buftype) + MPI_Type_free(&buf_view.type); + } + +fn_exit: + /* Reset fileview. Note fileview is never reused in PnetCDF */ + ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL); + + return status; +} + +/*----< ncmpio_file_close() >------------------------------------------------*/ +/* + * This function is collective. + */ +int +ncmpio_file_close(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + + if (ncp->independent_fh != ncp->collective_fh && + ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_close, (&ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + } + else { + /* When intra-node aggregation is enabled, only aggregators have a + * non-NULL ncp->pncio_fh and non-aggregators has pncio_fh == NULL. + */ + if (ncp->pncio_fh != NULL) { + err = PNCIO_File_close(ncp->pncio_fh); + NCI_Free(ncp->pncio_fh); + ncp->pncio_fh = NULL; + } + } + + return err; +} + +/*----< ncmpio_file_delete() >-----------------------------------------------*/ +/* + * This function is collective. + * + * This subroutine is called only from ncmpi_abort. When the file is being + * created and an error occurs, the program is still in define mode. In this + * case, the file is deleted. 
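Both the read and write paths above turn offset/length pairs into an MPI datatype with MPI_Type_create_hindexed. A minimal sketch of that construction (non-large-count variant; error handling omitted):

#include <mpi.h>

/* Sketch: build and commit a byte-based hindexed type from n pairs. */
static MPI_Datatype make_buftype(int n, const int *len, const MPI_Aint *off)
{
    MPI_Datatype dt;
    MPI_Type_create_hindexed(n, len, off, MPI_BYTE, &dt);
    MPI_Type_commit(&dt);
    return dt;  /* caller must MPI_Type_free() it */
}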
+ */ +int +ncmpio_file_delete(NC *ncp) +{ + int err=NC_NOERR; + + if (ncp->rank == 0) { + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + TRACE_IO(MPI_File_delete, ((char *)ncp->path, ncp->mpiinfo)); + if (mpireturn != MPI_SUCCESS) + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + else + err = PNCIO_File_delete(ncp->path); + } + + if (ncp->nprocs > 1) + MPI_Bcast(&err, 1, MPI_INT, 0, ncp->comm); + + return err; +} + +/*----< ncmpio_file_sync() >-------------------------------------------------*/ +/* This function must be called collectively, no matter if it is in collective + * or independent data mode. + */ +int +ncmpio_file_sync(NC *ncp) { + char *mpi_name; + int mpireturn; + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + if (ncp->pncio_fh == NULL) + return NC_NOERR; + return PNCIO_File_sync(ncp->pncio_fh); + } + + /* the remaining of this subroutine are for when using MPI-IO */ + + if (ncp->independent_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->independent_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + if (ncp->nprocs == 1) return NC_NOERR; + + /* When intra-node aggregation is enabled, non-aggregator's + * ncp->collective_fh is always MPI_FILE_NULL. When disabled, + * ncp->collective_fh on all ranks is never MPI_FILE_NULL as collective + * mode is default in PnetCDF. + */ + if (ncp->collective_fh != MPI_FILE_NULL) { + TRACE_IO(MPI_File_sync, (ncp->collective_fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + } + + /* Barrier is not necessary ... + TRACE_COMM(MPI_Barrier)(ncp->comm); + */ + + return NC_NOERR; +} + +/*----< ncmpio_file_set_view() >---------------------------------------------*/ +/* This subroutine is collective when using MPI-IO. When using internal PNCIO + * driver, this subroutine is independent. + */ +int +ncmpio_file_set_view(const NC *ncp, + MPI_Offset disp, /* IN/OUT */ + MPI_Datatype filetype, + MPI_Aint npairs, #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_all_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Count *offsets, + MPI_Count *lengths #else - TRACE_IO(MPI_File_write_at_all, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + MPI_Offset *offsets, + int *lengths #endif - } else { +) +{ + char *mpi_name; + int err, mpireturn, status=NC_NOERR; + MPI_File fh; + +assert(filetype == MPI_BYTE); +assert(disp == 0); + + if (ncp->fstype != PNCIO_FSTYPE_MPIIO) { + /* Skip setting fileview for ranks whose pncio_fh is NULL */ + if (ncp->pncio_fh == NULL) + return NC_NOERR; + + /* When PnetCDF's internal PNCIO driver is used, the request has been + * flattened into offsets and lengths. Thus passed-in filetype is not + * constructed. Note offsets and lengths are not relative to any MPI-IO + * fileview. They will be reused in PNCIO driver as a flattened file + * type struct, which avoids repeated work of constructing and + * flattening the filetype. + */ + return PNCIO_File_set_view(ncp->pncio_fh, disp, filetype, npairs, + offsets, lengths); + } + + /* Now, ncp->fstype == PNCIO_FSTYPE_MPIIO, i.e. using MPI-IO. */ + int to_free_filetype=0; + + /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */ + fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP)) + ? 
ncp->collective_fh : ncp->independent_fh; + + if (fh == MPI_FILE_NULL) /* not INA aggregator */ + return NC_NOERR; + + if (npairs == 0) /* zero-sized requests */ + filetype = MPI_BYTE; + else { #ifdef HAVE_MPI_LARGE_COUNT - TRACE_IO(MPI_File_write_at_c, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets, + MPI_BYTE, &filetype); #else - TRACE_IO(MPI_File_write_at, (fh, offset, xbuf, xlen, xbuf_type, &mpistatus)); + assert(sizeof(*offsets) == sizeof(MPI_Aint)); + /* construct fileview */ + mpireturn = MPI_Type_create_hindexed(npairs, lengths, + (MPI_Aint*)offsets, + MPI_BYTE, &filetype); #endif - } if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); /* return the first encountered error if there is any */ - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } + if (status == NC_NOERR) status = err; } else { - /* update the number of bytes written since file open */ -#ifdef HAVE_MPI_GET_COUNT_C - MPI_Count put_size; - MPI_Get_count_c(&mpistatus, MPI_BYTE, &put_size); - ncp->put_size += put_size; -#else - int put_size; - mpireturn = MPI_Get_count(&mpistatus, xbuf_type, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else { -#ifdef HAVE_MPI_TYPE_SIZE_X - /* MPI_Type_size_x is introduced in MPI 3.0 */ - mpireturn = MPI_Type_size_x(xbuf_type, &btype_size); -#else - mpireturn = MPI_Type_size(xbuf_type, &btype_size); -#endif - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += req_size; - else - ncp->put_size += btype_size * put_size; + mpireturn = MPI_Type_commit(&filetype); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; } -#endif + else + to_free_filetype = 1; } - if (xbuf != buf) NCI_Free(xbuf); - if (xbuf_type != buf_type && xbuf_type != MPI_BYTE) - MPI_Type_free(&xbuf_type); } + TRACE_IO(MPI_File_set_view, (fh, disp, MPI_BYTE, filetype, "native", + MPI_INFO_NULL)); + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, mpi_name); + if (status == NC_NOERR) status = err; +assert(0); + } + + if (to_free_filetype) + MPI_Type_free(&filetype); + return status; } +/*----< ncmpio_file_open() >-------------------------------------------------*/ +int +ncmpio_file_open(NC *ncp, + MPI_Comm comm, + const char *path, + int omode, + MPI_Info info) +{ + int err=NC_NOERR; + + /* open file collectively */ + if (ncp->fstype == PNCIO_FSTYPE_MPIIO) { + char *mpi_name; + int mpireturn; + MPI_File fh; + + TRACE_IO(MPI_File_open, (comm, path, omode, info, &fh)); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, mpi_name); + + /* Now the file has been successfully opened */ + ncp->collective_fh = fh; + ncp->independent_fh = (ncp->nprocs > 1) ? 
MPI_FILE_NULL : fh;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS)
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    }
+    else { /* ncp->fstype != PNCIO_FSTYPE_MPIIO */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+
+        err = PNCIO_File_open(comm, path, omode, info, ncp->pncio_fh);
+        if (err != NC_NOERR) return err;
+
+        /* Now the file has been successfully opened, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+    }
+
+    return err;
+}
+
diff --git a/src/drivers/ncmpio/ncmpio_file_misc.c b/src/drivers/ncmpio/ncmpio_file_misc.c
index 932b5027f..d936aa643 100644
--- a/src/drivers/ncmpio/ncmpio_file_misc.c
+++ b/src/drivers/ncmpio/ncmpio_file_misc.c
@@ -81,8 +81,7 @@ dup_NC(const NC *ref)
 int
 ncmpio_redef(void *ncdp)
 {
-    char *mpi_name;
-    int err, status=NC_NOERR, mpireturn;
+    int err, status=NC_NOERR;
     NC *ncp = (NC*)ncdp;
 
 #if 0
@@ -100,7 +99,7 @@ ncmpio_redef(void *ncdp)
     if (NC_indep(ncp)) /* exit independent mode, if in independent mode */
         ncmpio_end_indep_data(ncp);
 
-    /* duplicate a header to be used in enddef() for checking if header grows */
+    /* duplicate header to be used in enddef() for checking if header grows */
     ncp->old = dup_NC(ncp);
     if (ncp->old == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)
 
@@ -108,21 +107,8 @@ ncmpio_redef(void *ncdp)
     fSet(ncp->flags, NC_MODE_DEF);
 
     /* must reset fileview as header extent may later change in enddef() */
-    TRACE_IO(MPI_File_set_view, (ncp->collective_fh, 0, MPI_BYTE,
-                                 MPI_BYTE, "native", MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        DEBUG_ASSIGN_ERROR(status, err)
-    }
-
-    if (ncp->independent_fh != MPI_FILE_NULL) {
-        TRACE_IO(MPI_File_set_view, (ncp->independent_fh, 0, MPI_BYTE,
-                                     MPI_BYTE, "native", MPI_INFO_NULL));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            DEBUG_ASSIGN_ERROR(status, err)
-        }
-    }
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    DEBUG_ASSIGN_ERROR(status, err)
 
     return status;
 }
@@ -132,7 +118,6 @@ ncmpio_redef(void *ncdp)
 int
 ncmpio_begin_indep_data(void *ncdp)
 {
-    char *mpi_name;
     NC *ncp = (NC*)ncdp;
 
     if (NC_indef(ncp))  /* must not be in define mode */
@@ -151,6 +136,66 @@ ncmpio_begin_indep_data(void *ncdp)
     /* raise independent flag */
     fSet(ncp->flags, NC_MODE_INDEP);
 
+    /* Barrier is necessary to prevent non-aggregators from calling open()
+     * before the file has been collectively created by the aggregators.
+     */
+    MPI_Barrier(ncp->comm);
+
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+        /* When using PnetCDF's PNCIO driver, there are 2 scenarios:
+         * 1. When intra-node aggregation (INA) is enabled, at the end of
+         *    ncmpi_create/ncmpi_open, non-aggregators' pncio_fh is NULL. Thus,
+         *    when switching to independent data mode, we can reuse pncio_fh
+         *    to store the handle of the file opened with MPI_COMM_SELF. Note
+         *    whether pncio_fh is NULL does not tell whether INA is enabled.
+         * 2. When INA is disabled, all ranks call PNCIO_File_open() and thus
+         *    pncio_fh should not be NULL. In other words, this scenario
+         *    should not reach here at all. Because PnetCDF's PNCIO driver
+         *    relaxes the File_setview subroutine so that it can be called
+         *    independently, the same pncio_fh can be used for both collective
+         *    and independent I/O APIs.
+         *    Note we cannot reuse pncio_fh for the above scenario 1, because
+         *    in collective data mode, all ranks must participate in each
+         *    collective I/O call.
+         */
+        int err;
+        char *filename;
+
+        if (ncp->pncio_fh != NULL)
+            /* Only INA non-aggregators' pncio_fh can be NULL, because
+             * aggregators open the file collectively and their pncio_fh can
+             * never be NULL.
+             */
+            return NC_NOERR;
+
+        filename = ncmpii_remove_file_system_type_prefix(ncp->path);
+
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes = 1;
+        ncp->pncio_fh->node_ids = (int*) NCI_Malloc(sizeof(int));
+        ncp->pncio_fh->node_ids[0] = 0;
+
+        int omode = fClr(ncp->mpiomode, MPI_MODE_CREATE);
+
+        err = PNCIO_File_open(MPI_COMM_SELF, filename, omode, ncp->mpiinfo,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR)
+            return err;
+
+        /* get the I/O hints used/modified by the PNCIO driver */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) return err;
+
+        /* Add PnetCDF hints into ncp->mpiinfo */
+        ncmpio_hint_set(ncp, ncp->mpiinfo);
+
+        NCI_Free(ncp->pncio_fh->node_ids);
+        ncp->pncio_fh->node_ids = NULL;
+
+        return NC_NOERR;
+    }
+
     /* PnetCDF's default mode is collective. MPI file handle, collective_fh,
      * will never be MPI_FILE_NULL. We must use a separate MPI file handle
      * opened with MPI_COMM_SELF, because MPI_File_set_view is a collective
@@ -159,12 +204,20 @@
      * called.
      */
     if (ncp->independent_fh == MPI_FILE_NULL) {
+        char *mpi_name;
         int mpireturn;
-        TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path,
-                                 ncp->mpiomode, ncp->mpiinfo,
-                                 &ncp->independent_fh));
+        TRACE_IO(MPI_File_open, (MPI_COMM_SELF, ncp->path, ncp->mpiomode,
+                                 ncp->mpiinfo, &ncp->independent_fh));
+        if (mpireturn != MPI_SUCCESS)
+            return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+
+        /* get the I/O hints used/modified by MPI-IO */
+        mpireturn = MPI_File_get_info(ncp->independent_fh, &ncp->mpiinfo);
         if (mpireturn != MPI_SUCCESS)
             return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+
+        /* Copy MPI-IO hints into ncp->mpiinfo */
+        ncmpio_hint_set(ncp, ncp->mpiinfo);
     }
     return NC_NOERR;
 }
@@ -242,9 +295,14 @@ ncmpio_abort(void *ncdp)
     }
 
     /* close the file */
-    err = ncmpio_close_files(ncp, doUnlink);
+    err = ncmpio_file_close(ncp);
     if (status == NC_NOERR ) status = err;
 
+    if (doUnlink) {
+        err = ncmpio_file_delete(ncp);
+        status = (status == NC_NOERR) ? err : status;
+    }
+
     /* free up space occupied by the header metadata */
     ncmpio_free_NC(ncp);
 
@@ -444,12 +502,23 @@
 int
 ncmpi_delete(const char *filename,
              MPI_Info    info)
 {
+    int err = NC_NOERR;
+#ifdef MIMIC_LUSTRE
+    char *path = ncmpii_remove_file_system_type_prefix(filename);
+    err = unlink(path);
+    if (err != 0)
+        err = ncmpii_error_posix2nc("unlink");
+#else
+    err = PNCIO_File_delete(filename);
+#if 0
     char *mpi_name;
-    int err=NC_NOERR, mpireturn;
+    int mpireturn;
 
-    TRACE_IO(MPI_File_delete, ((char*)filename, info));
+    TRACE_IO(MPI_File_delete, (filename, info));
     if (mpireturn != MPI_SUCCESS)
         err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+#endif
+#endif
 
     return err;
 }
 
diff --git a/src/drivers/ncmpio/ncmpio_filetype.c b/src/drivers/ncmpio/ncmpio_filetype.c
index 828ab4132..3d84d5407 100644
--- a/src/drivers/ncmpio/ncmpio_filetype.c
+++ b/src/drivers/ncmpio/ncmpio_filetype.c
@@ -506,6 +506,9 @@ ncmpio_filetype_create_vars(const NC *ncp,
     MPI_Offset i, nblocks, nelems, *blocklens;
     MPI_Datatype filetype=MPI_BYTE;
 
+/* This is no longer used, as all requests go to INA subroutines to flatten.
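The begin-indep logic above boils down to a lazy per-process reopen. Here is a minimal sketch of that pattern under stated assumptions (generic names, error handling trimmed; this is not the PnetCDF API itself): the file was already created collectively, so MPI_MODE_CREATE must be cleared before reopening it on MPI_COMM_SELF.

    #include <mpi.h>

    static int open_indep_handle(const char *path, int omode, MPI_Info info,
                                 MPI_File *indep_fh)
    {
        if (*indep_fh != MPI_FILE_NULL)  /* already opened earlier */
            return MPI_SUCCESS;

        omode &= ~MPI_MODE_CREATE;       /* file exists; do not re-create */

        /* MPI_COMM_SELF makes the open independent: no other rank needs to
         * participate, unlike a collective open on the file communicator */
        return MPI_File_open(MPI_COMM_SELF, path, omode, info, indep_fh);
    }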
*/ +assert(0); + if (stride == NULL) return filetype_create_vara(ncp, varp, start, count, offset_ptr, filetype_ptr, is_filetype_contig); @@ -606,105 +609,3 @@ ncmpio_filetype_create_vars(const NC *ncp, return err; } -/*----< ncmpio_file_set_view() >---------------------------------------------*/ -/* This function handles the special case for root process for setting its - * file view: to keeps the whole file header visible to the root process. This - * is because the root process may update the number of records or attributes - * into the file header while in data mode. In PnetCDF design, only root - * process can read/write the file header. - * This function is collective if called in collective data mode - */ -int -ncmpio_file_set_view(const NC *ncp, - MPI_File fh, - MPI_Offset *offset, /* IN/OUT */ - MPI_Datatype filetype) -{ - char *mpi_name; - int err, mpireturn, status=NC_NOERR; - - if (filetype == MPI_BYTE) { - /* filetype is a contiguous space, make the whole file visible */ - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, - "native", MPI_INFO_NULL)); - return NC_NOERR; - } - - if (ncp->rank == 0) { - /* prepend the whole file header to filetype */ - MPI_Datatype root_filetype=MPI_BYTE, ftypes[2]; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count blocklens[2]; - MPI_Count disps[2]; - blocklens[0] = ncp->begin_var; -#else - int blocklens[2]; - MPI_Aint disps[2]; - - /* check if header size > 2^31 */ - if (ncp->begin_var > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } - - blocklens[0] = (int)ncp->begin_var; -#endif - - /* first block is the header extent */ - disps[0] = 0; - ftypes[0] = MPI_BYTE; - - /* second block is filetype, the subarray request(s) to the variable */ - blocklens[1] = 1; - disps[1] = *offset; - ftypes[1] = filetype; - -#if !defined(HAVE_MPI_LARGE_COUNT) && (SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET) - if (*offset > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW); - goto err_out; - } -#endif - -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct_c"); - if (status == NC_NOERR) status = err; - } -#else - mpireturn = MPI_Type_create_struct(2, blocklens, disps, ftypes, - &root_filetype); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - if (status == NC_NOERR) status = err; - } -#endif - MPI_Type_commit(&root_filetype); - -#ifndef HAVE_MPI_LARGE_COUNT -err_out: -#endif - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, root_filetype, "native", - MPI_INFO_NULL)); - if (root_filetype != MPI_BYTE) - MPI_Type_free(&root_filetype); - - /* now update the explicit offset to be used in MPI-IO call later */ - *offset = ncp->begin_var; - } - else { - TRACE_IO(MPI_File_set_view, (fh, *offset, MPI_BYTE, filetype, "native", - MPI_INFO_NULL)); - /* the explicit offset is already set in fileview */ - *offset = 0; - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) status = err; - } - - return status; -} - diff --git a/src/drivers/ncmpio/ncmpio_fill.c b/src/drivers/ncmpio/ncmpio_fill.c index e392a366d..ef6040d1b 100644 --- a/src/drivers/ncmpio/ncmpio_fill.c +++ b/src/drivers/ncmpio/ncmpio_fill.c @@ -144,13 +144,33 @@ fill_var_rec(NC *ncp, NC_var *varp, MPI_Offset recno) /* record number */ { - char *mpi_name; int err, status=NC_NOERR, mpireturn; void *buf; - MPI_Offset var_len, 
start, count, offset;
-    MPI_File fh;
-    MPI_Status mpistatus;
-    MPI_Datatype bufType;
+    MPI_Offset var_len, start, count, offset, wlen;
+    PNCIO_View buf_view;
+
+    buf_view.type = MPI_BYTE;
+    buf_view.count = 0;
+    buf_view.is_contig = 1;
+    buf_view.size = 0;
+    buf_view.off = NULL;
+    buf_view.len = NULL;
+
+    /* When intra-node aggregation is enabled, use the communicator consisting
+     * only of the aggregators; comm, nprocs, and rank below are set
+     * accordingly. Non-aggregators do not participate in the fill operation.
+     */
+    MPI_Comm comm = ncp->comm;
+    int nprocs = ncp->nprocs;
+    int rank = ncp->rank;
+    if (ncp->num_aggrs_per_node > 0) {
+        if (ncp->my_aggr != ncp->rank)
+            return NC_NOERR;
+
+        comm = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+        rank = ncp->ina_rank;
+    }
 
     if (varp->ndims == 0) /* scalar variable */
         var_len = 1;
@@ -162,14 +182,14 @@ fill_var_rec(NC *ncp,
         var_len = varp->dsizes[0];
 
     /* divide total number of elements of this variable among all processes */
-    count = var_len / ncp->nprocs;
-    start = count * ncp->rank;
-    if (ncp->rank < var_len % ncp->nprocs) {
-        start += ncp->rank;
+    count = var_len / nprocs;
+    start = count * rank;
+    if (rank < var_len % nprocs) {
+        start += rank;
         count++;
     }
     else {
-        start += var_len % ncp->nprocs;
+        start += var_len % nprocs;
     }
 
     /* allocate buffer space */
@@ -179,64 +199,45 @@ fill_var_rec(NC *ncp,
     err = fill_var_buf(varp, count, buf);
     if (err != NC_NOERR) {
         NCI_Free(buf);
-        count = 0; /* still participate collective calls below */
+        /* still participate in collective calls below */
+        buf_view.size = 0;
         status = err;
     }
 
+    /* make the entire file visible */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    status = (status == NC_NOERR) ? err : status;
+
     /* calculate the starting file offset for each process */
     offset = varp->begin;
     if (IS_RECVAR(varp)) offset += ncp->recsize * recno;
     offset += start * varp->xsz;
 
-    /* when ncp->nprocs == 1, we keep I/O mode in independent mode at all time */
-    fh = ncp->collective_fh;
-
-    /* make the entire file visible */
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
-                                 MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
 
     count *= varp->xsz;
-    bufType = MPI_BYTE;
-
 #ifndef HAVE_MPI_LARGE_COUNT
     if (count > NC_MAX_INT) {
         DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
         if (status == NC_NOERR) status = err;
-        count = 0; /* participate collective write with 0-length request */
+        /* participate collective write with 0-length request */
+        buf_view.size = 0;
     }
 #endif
+    if (status == NC_NOERR)
+        buf_view.size = count;
+
     /* write to variable collectively */
-    if (ncp->nprocs > 1) {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_all_c, (fh, offset, buf, (MPI_Count)count,
-                                           bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at_all, (fh, offset, buf, (int)count,
-                                         bufType, &mpistatus));
-#endif
-    }
-    else {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_c, (fh, offset, buf, (MPI_Count)count,
-                                       bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at, (fh, offset, buf, (int)count,
-                                     bufType, &mpistatus));
-#endif
-    }
+    if (nprocs > 1)
+        wlen = ncmpio_file_write_at_all(ncp, offset, buf, buf_view);
+    else
+        wlen = ncmpio_file_write_at(ncp, offset, buf, buf_view);
+    if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
     NCI_Free(buf);
-    if (bufType != MPI_BYTE)
MPI_Type_free(&bufType);
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
 
     if (status != NC_NOERR) return status;
 
@@ -248,9 +249,9 @@ fill_var_rec(NC *ncp,
      * First, find the max numrecs among all processes.
      */
     MPI_Offset max_numrecs=recno+1;
-    if (ncp->nprocs > 1) {
+    if (nprocs > 1) {
         TRACE_COMM(MPI_Allreduce)(MPI_IN_PLACE, &max_numrecs, 1, MPI_OFFSET,
-                                  MPI_MAX, ncp->comm);
+                                  MPI_MAX, comm);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Allreduce");
             if (status == NC_NOERR) status = err;
@@ -363,24 +364,36 @@ fill_added_recs(NC *ncp, NC *old_ncp)
 static int
 fillerup_aggregate(NC *ncp, NC *old_ncp)
 {
-    int i, j, k, mpireturn, err, status=NC_NOERR;
+    int i, j, k, err, status=NC_NOERR;
     int start_vid, recno, nVarsFill;
-    char *buf_ptr, *noFill, *mpi_name;
+    char *buf_ptr, *noFill;
     void *buf;
     size_t nsegs;
-    MPI_Offset buf_len, var_len, nrecs, start, *count;
-    MPI_Datatype filetype, bufType;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    MPI_Offset buf_len, var_len, nrecs, start, *count, wlen;
     NC_var *varp;
+    PNCIO_View buf_view;
 #ifdef HAVE_MPI_LARGE_COUNT
-    MPI_Count *blocklengths, *offset;
+    MPI_Count *blocklengths=NULL, *offset=NULL;
 #else
-    int *blocklengths;
-    MPI_Aint *offset;
+    int *blocklengths=NULL;
+    MPI_Offset *offset=NULL;
 #endif
 
+    /* When intra-node aggregation is enabled, use the communicator consisting
+     * only of the aggregators; nprocs and rank below are set accordingly.
+     * Non-aggregators do not participate in the fill operation.
+     */
+    int nprocs = ncp->nprocs;
+    int rank = ncp->rank;
+    if (ncp->num_aggrs_per_node > 0) {
+        if (ncp->my_aggr != ncp->rank)
+            return NC_NOERR;
+
+        nprocs = ncp->ina_nprocs;
+        rank = ncp->ina_rank;
+    }
+
     /* find the starting vid for newly added variables */
     start_vid = 0;
     nrecs = 0;  /* the current number of records */
@@ -397,12 +410,16 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
      * variables' fill modes and overwrite local's if an inconsistency is found
      * Note ncp->vars.ndefined is already made consistent by this point.
      */
-    if (ncp->nprocs > 1) {
+    MPI_Comm comm = (ncp->num_aggrs_per_node > 0) ?
ncp->ina_comm : ncp->comm;
+
+    if (nprocs > 1) {
+        int mpireturn;
+
+        for (i=start_vid; i<ncp->vars.ndefined; i++)
             noFill[i-start_vid] = (char)(ncp->vars.value[i]->no_fill);
 
         TRACE_COMM(MPI_Bcast)(noFill, (ncp->vars.ndefined - start_vid),
-                              MPI_BYTE, 0, ncp->comm);
+                              MPI_BYTE, 0, comm);
         if (mpireturn != MPI_SUCCESS)
             return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast");
 
@@ -427,9 +444,9 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
     nsegs = (size_t)(ncp->vars.ndefined + ncp->vars.num_rec_vars * nrecs);
     count = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs);
 #ifdef HAVE_MPI_LARGE_COUNT
-    offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs);
+    offset = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nsegs);
 #else
-    offset = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nsegs);
+    offset = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * nsegs);
 #endif
 
     /* calculate each segment's offset and count */
@@ -446,19 +463,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
         else var_len = varp->dsizes[0];
 
         /* divide evenly total number of variable's elements among processes */
-        count[j] = var_len / ncp->nprocs;
-        start = count[j] * ncp->rank;
-        if (ncp->rank < var_len % ncp->nprocs) {
-            start += ncp->rank;
+        count[j] = var_len / nprocs;
+        start = count[j] * rank;
+        if (rank < var_len % nprocs) {
+            start += rank;
             count[j]++;
         }
         else
-            start += var_len % ncp->nprocs;
+            start += var_len % nprocs;
 
         /* calculate the starting file offset */
         start *= varp->xsz;
         start += varp->begin;
-        offset[j] = (MPI_Aint)start;
+#ifdef HAVE_MPI_LARGE_COUNT
+        offset[j] = (MPI_Count)start;
+#else
+        offset[j] = start;
+#endif
         if (start != offset[j]) {
             DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
             if (status == NC_NOERR) status = err;
@@ -483,19 +504,23 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
         else var_len = varp->dsizes[1];
 
         /* divide total number of variable's elements among all processes */
-        count[j] = var_len / ncp->nprocs;
-        start = count[j] * ncp->rank;
-        if (ncp->rank < var_len % ncp->nprocs) {
-            start += ncp->rank;
+        count[j] = var_len / nprocs;
+        start = count[j] * rank;
+        if (rank < var_len % nprocs) {
+            start += rank;
             count[j]++;
         }
         else
-            start += var_len % ncp->nprocs;
+            start += var_len % nprocs;
 
         /* calculate the starting file offset */
         start *= varp->xsz;
         start += varp->begin + ncp->recsize * recno;
-        offset[j] = (MPI_Aint)start;
+#ifdef HAVE_MPI_LARGE_COUNT
+        offset[j] = (MPI_Count)start;
+#else
+        offset[j] = start;
+#endif
         if (start != offset[j]) {
             DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
             if (status == NC_NOERR) status = err;
@@ -597,53 +622,26 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
     }
     /* k is the number of valid write requests */
 
     NCI_Free(noFill);
-
-    if (k == 0) {
-        filetype = MPI_BYTE;
-    }
-    else {
-        /* create fileview: a list of contiguous segment for each variable */
-#ifdef HAVE_MPI_LARGE_COUNT
-        mpireturn = MPI_Type_create_hindexed_c(k, blocklengths, offset,
-                                               MPI_BYTE, &filetype);
-#else
-        mpireturn = MPI_Type_create_hindexed(k, blocklengths, offset,
-                                             MPI_BYTE, &filetype);
-#endif
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_hindexed");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
-        }
-        else
-            MPI_Type_commit(&filetype);
-    }
-
-    NCI_Free(blocklengths);
     NCI_Free(count);
-    NCI_Free(offset);
 
-    /* when nprocs == 1, we keep I/O mode in independent mode at all time */
-    fh = ncp->collective_fh;
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, k, offset, blocklengths);
+    status = (status == NC_NOERR) ?
err : status;
 
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, filetype, "native",
-                                 MPI_INFO_NULL));
-    if (k > 0) MPI_Type_free(&filetype);
-
-    bufType = MPI_BYTE;
+    buf_view.type = MPI_BYTE;
     if (buf_len > NC_MAX_INT) {
 #ifdef HAVE_MPI_LARGE_COUNT
+        int mpireturn;
+
         mpireturn = MPI_Type_contiguous_c((MPI_Count)buf_len, MPI_BYTE,
-                                          &bufType);
+                                          &buf_view.type);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_contiguous_c");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
-            buf_len = 0;
+            buf_view.size = 0;
         }
         else {
-            MPI_Type_commit(&bufType);
-            buf_len = 1;
+            MPI_Type_commit(&buf_view.type);
         }
 #else
         if (status == NC_NOERR)
@@ -653,39 +651,38 @@ fillerup_aggregate(NC *ncp, NC *old_ncp)
 #endif
     }
 
-    /* write to variable collectively */
-    if (ncp->nprocs > 1) {
+    MPI_Offset off=0;
 #ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_all_c, (fh, 0, buf, (MPI_Count)buf_len,
-                                           bufType, &mpistatus));
+    MPI_Offset len=buf_len;
 #else
-        TRACE_IO(MPI_File_write_at_all, (fh, 0, buf, (int)buf_len,
-                                         bufType, &mpistatus));
+    int len=buf_len;
 #endif
-    }
-    else {
-#ifdef HAVE_MPI_LARGE_COUNT
-        TRACE_IO(MPI_File_write_at_c, (fh, 0, buf, (MPI_Count)buf_len,
-                                       bufType, &mpistatus));
-#else
-        TRACE_IO(MPI_File_write_at, (fh, 0, buf, (int)buf_len,
-                                     bufType, &mpistatus));
-#endif
-    }
+    /* write buffer is contiguous */
+    buf_view.size = buf_len;
+    buf_view.count = 1;
+    buf_view.off = &off;
+    buf_view.len = &len;
+    buf_view.is_contig = 1;
+
+    /* write to variable collectively */
+    if (nprocs > 1)
+        wlen = ncmpio_file_write_at_all(ncp, 0, buf, buf_view);
+    else
+        wlen = ncmpio_file_write_at(ncp, 0, buf, buf_view);
+    if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+
     NCI_Free(buf);
-    if (bufType != MPI_BYTE) MPI_Type_free(&bufType);
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
+    if (buf_view.type != MPI_BYTE) MPI_Type_free(&buf_view.type);
+
+    if (blocklengths != NULL) NCI_Free(blocklengths);
+    if (offset != NULL) NCI_Free(offset);
+
+    /* reset fileview to make the entire file visible */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    status = (status == NC_NOERR) ? err : status;
 
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
-                                 MPI_INFO_NULL));
-    if (mpireturn != MPI_SUCCESS) {
-        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-        if (status == NC_NOERR) status = err;
-    }
     return status;
 }
 
diff --git a/src/drivers/ncmpio/ncmpio_getput.m4 b/src/drivers/ncmpio/ncmpio_getput.m4
index 363701cac..9ceb80c24 100644
--- a/src/drivers/ncmpio/ncmpio_getput.m4
+++ b/src/drivers/ncmpio/ncmpio_getput.m4
@@ -44,20 +44,22 @@ dnl
 #include "ncmpio_subfile.h"
 #endif
 
+#define ALWAYS_USE_INA
+
 /* buffer layers:
 
    For write requests:
    buf   (user buffer of internal data type)
    lbuf  (contiguous buffer packed from buf based on buftype)
    cbuf  (contiguous buffer packed from lbuf based on imap)
-   xbuf  (contiguous buffer in external data type, type-casted/byte-swapped
+   xbuf  (contiguous buffer in external data type, type-cast/byte-swapped
          from cbuf, ready to be used in MPI_File_write to write to file)
 
    For read requests:
   xbuf  (contiguous buffer to be used in MPI_File_read to read from file.
Its contents are in external data type) - cbuf (contiguous buffer type-casted/byte-swapped from xbuf, its contents - are in internal data type) + cbuf (contiguous buffer type-cast/byte-swapped from xbuf, its contents are + in internal data type) lbuf (contiguous buffer unpacked from cbuf based on imap) buf (user buffer, unpacked from lbuf based on buftype) @@ -118,10 +120,18 @@ put_varm(NC *ncp, void *xbuf=NULL; int mpireturn, err=NC_NOERR, status=NC_NOERR, buftype_is_contig; int el_size, need_convert=0, need_swap=0, need_swap_back_buf=0; - int coll_indep, xtype_is_contig=1, can_swap_in_place; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, imaptype, filetype=MPI_BYTE; - MPI_File fh; + int can_swap_in_place; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to obtain the followings: * itype: element data type (MPI primitive type) in buftype @@ -135,20 +145,10 @@ put_varm(NC *ncp, * el_size: byte size of itype * buftype_is_contig: whether buftype is contiguous */ - if (varp == NULL) { /* zero-sized request */ - itype = MPI_BYTE; - el_size = 0; - bnelems = 0; - nbytes = 0; - buftype_is_contig = 0; - } - else { - err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); - if (err != NC_NOERR) goto err_check; - } - xtype_is_contig = buftype_is_contig; + err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); + if (err != NC_NOERR) goto err_check; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -174,10 +174,15 @@ put_varm(NC *ncp, goto err_check; /* check if type conversion and Endianness byte swap is needed */ - if (varp != NULL) { /* non-zero-sized request */ - need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); - need_swap = NEED_BYTE_SWAP(varp->xtype, itype); - } + need_convert = ncmpii_need_convert(ncp->format, varp->xtype, itype); + need_swap = NEED_BYTE_SWAP(varp->xtype, itype); + + /* check whether this is a true varm call, if yes, imaptype will be a + * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL + */ + imaptype = MPI_DATATYPE_NULL; + err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); + if (err != NC_NOERR) goto err_check; /* check if in-place byte swap can be enabled */ can_swap_in_place = 1; @@ -190,25 +195,23 @@ put_varm(NC *ncp, else if (! fIsSet(ncp->flags, NC_MODE_SWAP_ON)) { /* auto mode, as user does not explicitly enable it */ if (nbytes <= NC_BYTE_SWAP_BUFFER_SIZE) - /* If write amount is small, disable in-place swap. - * This is because the user buffer may be immutable. In this - * case, in-place swap will cause segmentation fault. Immutable - * buffers are usually small. */ + /* If write amount is small, disable in-place swap. This is + * because the user buffer may be immutable. In this case, + * in-place swap will cause segmentation fault. Immutable + * buffers are usually small. 
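To make the in-place-swap trade-off described above concrete, here is a hedged sketch of the decision under stated assumptions: NC_SWAP_THRESHOLD, swap4b, and the scratch buffer are hypothetical stand-ins, not PnetCDF's actual helpers. Small requests are swapped into a copy because the user buffer may be read-only; large requests are swapped in place and must be swapped back after the write.

    #include <stdint.h>
    #include <string.h>

    #define NC_SWAP_THRESHOLD 4096      /* hypothetical threshold */

    static void swap4b(void *p, size_t n)   /* swap n 4-byte elements */
    {
        uint32_t *u = (uint32_t*) p;
        size_t i;
        for (i=0; i<n; i++)
            u[i] = (u[i] >> 24) | ((u[i] >> 8) & 0x0000ff00) |
                   ((u[i] << 8) & 0x00ff0000) | (u[i] << 24);
    }

    static void *prep_write_buf(void *ubuf, size_t nbytes, void *scratch)
    {
        if (nbytes <= NC_SWAP_THRESHOLD) {
            /* small request: user buffer may be immutable, swap a copy */
            memcpy(scratch, ubuf, nbytes);
            swap4b(scratch, nbytes / 4);
            return scratch;
        }
        swap4b(ubuf, nbytes / 4);   /* in place; caller swaps back later */
        return ubuf;
    }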
+ */ can_swap_in_place = 0; } } - /* check whether this is a true varm call, if yes, imaptype will be a - * newly created MPI derived data type, otherwise MPI_DATATYPE_NULL - */ - imaptype = MPI_DATATYPE_NULL; - if (varp != NULL) { /* non-zero-sized request */ - err = ncmpii_create_imaptype(varp->ndims, count, imap, itype, &imaptype); - if (err != NC_NOERR) goto err_check; - } - +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && buftype_is_contig && + (!need_swap || can_swap_in_place)) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || (can_swap_in_place && buftype_is_contig))) { + (!need_swap || (can_swap_in_place && buftype_is_contig))) +#endif + { /* reuse buftype, bufcount, buf in later MPI file write */ xbuf = buf; if (need_swap) { @@ -216,17 +219,17 @@ put_varm(NC *ncp, need_swap_back_buf = 1; } } - else if (varp != NULL) { + else { xbuf = NCI_Malloc((size_t)nbytes); if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } need_swap_back_buf = 0; - xtype_is_contig = 1; - /* pack buf to xbuf, byte-swap and type-convert on xbuf, which - * will later be used in MPI file write */ + /* Pack buf to xbuf, byte-swap and type-convert on xbuf, which will + * later be used in MPI file write. + */ err = ncmpio_pack_xbuf(ncp->format, varp, bufcount, buftype, buftype_is_contig, bnelems, itype, el_size, imaptype, need_convert, need_swap, nbytes, buf, @@ -238,16 +241,14 @@ put_varm(NC *ncp, } } - /* Set nelems and xtype which will be used in MPI read/write */ - if (buf != xbuf && varp != NULL) { + /* Set nelems which will be used in MPI read/write */ + if (buf != xbuf) { /* xbuf is a contiguous buffer */ - xtype = ncmpii_nc2mpitype(varp->xtype); nelems = bnelems; } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? bnelems : bufcount; - xtype = buftype; } err_check: @@ -263,12 +264,22 @@ err_check: */ nbytes = 0; nelems = 0; - filetype = MPI_BYTE; - xtype = MPI_BYTE; } - if (fIsSet(reqMode, NC_REQ_COLL) && ncp->my_aggr >= 0 && ncp->nprocs > 1) { - /* intra-node write aggregation must be in collective mode */ +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_WR, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ void *wbuf = (nbytes == 0) ? NULL : xbuf; err = ncmpio_intra_node_aggregation(ncp, NC_REQ_WR, varp, start, count, stride, nelems, xtype, wbuf); @@ -297,15 +308,8 @@ err_check: * at a time. */ - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; @@ -316,10 +320,10 @@ err_check: * written to the variable defined in file. Note data stored in xbuf * is in the external data type, ready to be written to file. 
*/ - err = ncmpio_read_write(ncp, NC_REQ_WR, coll_indep, offset, nelems, - xtype, xbuf, xtype_is_contig); + err = ncmpio_read_write(ncp, NC_REQ_WR, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } +#endif /* done with xbuf */ if (xbuf != NULL && xbuf != buf) NCI_Free(xbuf); @@ -340,7 +344,8 @@ err_check: new_numrecs = start[0] + (count[0] - 1) * stride[0] + 1; /* note new_numrecs can be smaller than ncp->numrecs when this - * write request writes existing records */ + * write request writes existing records + */ } if (fIsSet(reqMode, NC_REQ_COLL)) { @@ -357,8 +362,9 @@ err_check: if (status == NC_NOERR) status = err; } } - /* In collective mode, ncp->numrecs is always sync-ed among - processes */ + /* In collective data mode, ncp->numrecs is always sync-ed among + * processes + */ if (ncp->numrecs < max_numrecs) { err = ncmpio_write_numrecs(ncp, max_numrecs); if (status == NC_NOERR) status = err; @@ -396,11 +402,19 @@ get_varm(NC *ncp, int reqMode) /* WR/RD/COLL/INDEP */ { void *xbuf=NULL; - int err=NC_NOERR, status=NC_NOERR, coll_indep, xtype_is_contig=1; + int err=NC_NOERR, status=NC_NOERR; int el_size, buftype_is_contig, need_swap=0, need_convert=0; - MPI_Offset nelems=0, bnelems=0, nbytes=0, offset=0; - MPI_Datatype itype, xtype=MPI_BYTE, filetype=MPI_BYTE, imaptype=MPI_DATATYPE_NULL; - MPI_File fh; + MPI_Offset nelems=0, bnelems=0, nbytes=0; + MPI_Datatype itype, imaptype=MPI_DATATYPE_NULL; + + if (varp == NULL) { /* zero-sized request */ + itype = MPI_BYTE; + el_size = 0; + bnelems = 0; + nbytes = 0; + buftype_is_contig = 0; + goto err_check; + } /* decode buftype to see if we can use buf to read from file. * itype: element data type (MPI primitive type) in buftype @@ -415,10 +429,9 @@ get_varm(NC *ncp, * buftype_is_contig: whether buftype is contiguous */ err = ncmpii_buftype_decode(varp->ndims, varp->xtype, count, bufcount, - buftype, &itype, &el_size, &bnelems, - &nbytes, &buftype_is_contig); + buftype, &itype, &el_size, &bnelems, &nbytes, + &buftype_is_contig); if (err != NC_NOERR) goto err_check; - xtype_is_contig = buftype_is_contig; if (buftype == MPI_DATATYPE_NULL) { /* buftype and bufcount are ignored */ bufcount = bnelems; @@ -461,32 +474,36 @@ get_varm(NC *ncp, * For condition 1, buftype is decoded in ncmpii_buftype_decode() * For condition 2, imap is checked in ncmpii_create_imaptype() */ +#ifdef ALWAYS_USE_INA + if (!need_convert && imaptype == MPI_DATATYPE_NULL && + !need_swap && buftype_is_contig) +#else if (!need_convert && imaptype == MPI_DATATYPE_NULL && - (!need_swap || buftype_is_contig)) { + (!need_swap || buftype_is_contig)) +#endif + { /* reuse buftype, bufcount, buf in later MPI file read */ xbuf = buf; } else { /* allocate xbuf for reading */ xbuf = NCI_Malloc((size_t)nbytes); - xtype_is_contig = 1; if (xbuf == NULL) { DEBUG_ASSIGN_ERROR(err, NC_ENOMEM) goto err_check; } } /* Note xbuf is the buffer to be used in MPI read calls, and hence its - * contents are in the external type */ + * contents are in the external type. + */ - /* Set nelems and xtype which will be used in MPI read/write */ + /* Set nelems which will be used in MPI read/write */ if (buf != xbuf) { /* xbuf is a contiguous buffer */ nelems = bnelems; - xtype = ncmpii_nc2mpitype(varp->xtype); } else { /* we can safely use bufcount and buftype in MPI File read/write */ nelems = (bufcount == NC_COUNT_IGNORE) ? 
bnelems : bufcount; - xtype = buftype; } err_check: @@ -496,58 +513,71 @@ err_check: /* for independent API, this process returns now */ if (fIsSet(reqMode, NC_REQ_INDEP)) return err; - /* for collective API, this process needs to participate the - * collective I/O operations, but with zero-length request + /* for collective API, this process needs to participate the collective + * I/O operations, but with zero-length request */ - filetype = MPI_BYTE; - xtype = MPI_BYTE; nbytes = 0; nelems = 0; } + +#ifdef ALWAYS_USE_INA + err = ncmpio_ina_req(ncp, NC_REQ_RD, varp, start, count, stride, nbytes, + xbuf); + if (status == NC_NOERR) status = err; +#else + MPI_Offset offset=0; + MPI_Datatype filetype=MPI_BYTE, xtype; + + /* Set xtype which will be used in MPI read/write */ + xtype = (nbytes == 0) ? MPI_BYTE + : (buf != xbuf) ? ncmpii_nc2mpitype(varp->xtype) : buftype; + + if (fIsSet(reqMode, NC_REQ_COLL) && ncp->num_aggrs_per_node > 0) { + /* intra-node aggregation must be in collective mode */ + void *rbuf = (nbytes == 0) ? NULL : xbuf; + err = ncmpio_intra_node_aggregation(ncp, NC_REQ_RD, varp, start, count, + stride, nelems, xtype, rbuf); + if (status == NC_NOERR) status = err; + } else { - /* Create the filetype for this request and calculate the beginning - * file offset for this request. If this request is contiguous in file, - * then set filetype == MPI_BYTE. Otherwise filetype will be an MPI - * derived data type. + if (nbytes > 0) { + /* Create the filetype for this request and calculate the beginning + * file offset for this request. If this request is contiguous in + * file, then set filetype == MPI_BYTE. Otherwise filetype will be + * an MPI derived data type. + */ + err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, + &offset, &filetype, NULL); + if (err != NC_NOERR) { + filetype = MPI_BYTE; + xtype = MPI_BYTE; + nbytes = 0; + nelems = 0; + if (status == NC_NOERR) status = err; + } + } + + /* TODO: if record variables are too big (so big that we cannot store + * the stride between records in an MPI_Aint, for example) then we will + * have to process this one record at a time. */ - err = ncmpio_filetype_create_vars(ncp, varp, start, count, stride, - &offset, &filetype, NULL); + + /* MPI_File_set_view is collective */ + err = ncmpio_file_set_view(ncp, &offset, filetype, 0, NULL, NULL); if (err != NC_NOERR) { - filetype = MPI_BYTE; - xtype = MPI_BYTE; - nbytes = 0; - nelems = 0; + nelems = 0; /* skip this request */ if (status == NC_NOERR) status = err; } - } - - /* TODO: if record variables are too big (so big that we cannot store the - * stride between records in an MPI_Aint, for example) then we will - * have to process this one record at a time. - */ - - fh = ncp->independent_fh; - coll_indep = NC_REQ_INDEP; - if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) { - fh = ncp->collective_fh; - coll_indep = NC_REQ_COLL; - } + if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - /* MPI_File_set_view is collective */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); - if (err != NC_NOERR) { - nelems = 0; /* skip this request */ + /* xtype is the element data type (MPI primitive type) in xbuf to be + * read from the variable defined in file. Note xbuf will contain data + * read from the file and hence is in the external data type. 
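The zero-length branch above reflects a general MPI rule: in collective data mode every process must enter the collective call, even when it has nothing to transfer, or the program deadlocks. A minimal generic sketch, assuming an MPI 4.0 library for the large-count variant:

    #include <mpi.h>

    static int collective_read(MPI_File fh, MPI_Offset offset,
                               void *buf, MPI_Count nbytes)
    {
        MPI_Status st;
        /* ranks with no data pass nbytes == 0 but still make the call */
        return MPI_File_read_at_all_c(fh, offset, buf, nbytes,
                                      MPI_BYTE, &st);
    }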
+ */ + err = ncmpio_read_write(ncp, NC_REQ_RD, offset, nelems, xtype, xbuf); if (status == NC_NOERR) status = err; } - if (filetype != MPI_BYTE) MPI_Type_free(&filetype); - - /* xtype is the element data type (MPI primitive type) in xbuf to be - * read from the variable defined in file. Note xbuf will contain data read - * from the file and hence is in the external data type. - */ - err = ncmpio_read_write(ncp, NC_REQ_RD, coll_indep, offset, nelems, xtype, - xbuf, xtype_is_contig); - if (status == NC_NOERR) status = err; +#endif if (nelems > 0) { /* unpack xbuf into user buffer, buf */ @@ -608,15 +638,22 @@ ncmpio_$1_var(void *ncdp, * write, they still need to participate the communication part of the * intra-node aggregation operation. */ - ifelse(`$1',`put',`if (ncp->my_aggr >= 0) - return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, buftype, reqMode);') +#ifdef ALWAYS_USE_INA + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); +#else + if (ncp->num_aggrs_per_node > 0) + return $1_varm(ncp, NULL, NULL, NULL, NULL, imap, NULL, 0, + buftype, reqMode); /* this collective API has a zero-length request */ return ncmpio_getput_zero_req(ncp, reqMode); +#endif } /* obtain NC_var object pointer, varp. Note sanity check for ncdp and - * varid has been done in dispatchers */ + * varid has been done in dispatchers + */ varp = ncp->vars.value[varid]; #ifdef ENABLE_SUBFILING diff --git a/src/drivers/ncmpio/ncmpio_header_get.c b/src/drivers/ncmpio/ncmpio_header_get.c index 6ddd89bc1..b1ae28cc4 100644 --- a/src/drivers/ncmpio/ncmpio_header_get.c +++ b/src/drivers/ncmpio/ncmpio_header_get.c @@ -316,103 +316,101 @@ hdr_len_NC_vararray(const NC_vararray *ncap, /*----< hdr_fetch() >--------------------------------------------------------*/ /* Fetch the next header chunk. The chunk buffer, pointed by gbp->base, is of - * size 'gbp->chunk' bytes. Be careful not to overwrite leftover (yet to be - * used) data in the buffer before fetching a new chunk. + * size 'gbp->ncp->chunk' bytes. Be careful not to overwrite leftover (yet to + * be used) data in the buffer before fetching a new chunk. */ static int hdr_fetch(bufferinfo *gbp) { - char *mpi_name; int rank, nprocs, err=NC_NOERR, mpireturn; - MPI_Status mpistatus; + PNCIO_View buf_view; assert(gbp->base != NULL); - MPI_Comm_size(gbp->comm, &nprocs); - MPI_Comm_rank(gbp->comm, &rank); + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + buf_view.is_contig = 1; + buf_view.type = MPI_BYTE; + + MPI_Comm_size(gbp->ncp->comm, &nprocs); + MPI_Comm_rank(gbp->ncp->comm, &rank); if (rank == 0) { char *readBuf; int readLen; size_t slack; + MPI_Offset rlen; /* any leftover data in the buffer */ - slack = gbp->chunk - (gbp->pos - gbp->base); - if (slack == gbp->chunk) slack = 0; + slack = gbp->ncp->chunk - (gbp->pos - gbp->base); + if (slack == gbp->ncp->chunk) slack = 0; - /* When gbp->chunk == (gbp->pos - gbp->base), all data in the buffer has - * been consumed. If not, then read additional header of size - * (gbp->chunk - slack) into a contiguous buffer, pointed by gbp->base + - * slack. + /* When gbp->ncp->chunk == (gbp->pos - gbp->base), all data in the + * buffer has been consumed. If not, then read additional header of + * size (gbp->ncp->chunk - slack) into a contiguous buffer, pointed by + * gbp->base + slack. 
*/
         readBuf = gbp->base;
-        readLen = gbp->chunk;
+        readLen = gbp->ncp->chunk;
         if (slack > 0) {
             /* move slack to beginning of the buffer, gbp->base */
             memmove(gbp->base, gbp->pos, slack);
             readBuf += slack;
             readLen -= slack;
         }
 
-        /* explicitly initialize mpistatus object to 0. For zero-length read,
-         * MPI_Get_count may report incorrect result for some MPICH version,
-         * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-         */
-        memset(&mpistatus, 0, sizeof(MPI_Status));
+        buf_view.size = readLen;
 
         /* fileview is already entire file visible and MPI_File_read_at does
            not change the file pointer */
-        if (gbp->coll_mode == 1) { /* collective read */
-            TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, gbp->offset, readBuf,
-                                            readLen, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_read_at, (gbp->collective_fh, gbp->offset, readBuf,
-                                        readLen, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD)
-        }
-        else {
-            /* Obtain the actual read amount. It may be smaller than readLen,
-             * when the remaining file size is smaller than read chunk size.
-             * Because each MPI File_read reads amount of readLen bytes, and
-             * readLen <= read chunk size which is <= NC_MAX_INT, calling
-             * MPI_Get_count() is sufficient. No need to call MPI_Get_count_c()
-             */
-            int get_size;
-            MPI_Get_count(&mpistatus, MPI_BYTE, &get_size);
-            gbp->get_size += get_size;
-
-            /* If actual read amount is shorter than readLen, then we zero-out
-             * the remaining buffer. This is because the MPI_Bcast below
-             * broadcasts a buffer of a fixed size, gbp->chunk. Without zeroing
-             * out, valgrind will complain about the uninitialized values.
+        if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL))
+            /* collective read */
+            rlen = ncmpio_file_read_at_all(gbp->ncp, gbp->offset, readBuf,
+                                           buf_view);
+        else
+            /* independent read */
+            rlen = ncmpio_file_read_at(gbp->ncp, gbp->offset, readBuf,
+                                       buf_view);
+
+        if (rlen > 0) {
+            /* rlen is the actual read amount, which can be smaller than
+             * readLen when the remaining file size is smaller than readLen.
+             * In that case, zero-out the rest of the buffer. This is because
+             * the MPI_Bcast below broadcasts a buffer of a fixed size,
+             * gbp->ncp->chunk. Without zeroing out, valgrind will complain
+             * about the uninitialized values.
              */
-            if (get_size < readLen)
-                memset(readBuf + get_size, 0, readLen - get_size);
+            if (rlen < readLen)
+                memset(readBuf + rlen, 0, readLen - rlen);
         }
+        else if (rlen < 0)
+            err = (int)rlen;
+
         /* only root process reads file header, keeps track of current read
          * file pointer location */
-        gbp->offset += readLen;
+        gbp->offset += rlen;
     }
-    else if (gbp->coll_mode == 1) { /* collective read */
-        /* other processes participate the collective call */
-        TRACE_IO(MPI_File_read_at_all, (gbp->collective_fh, 0, NULL,
-                                        0, MPI_BYTE, &mpistatus));
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_ASSIGN_ERROR(err, NC_EREAD)
-        }
+    else if (gbp->ncp->nprocs > 1 && fIsSet(gbp->ncp->flags, NC_HCOLL)) {
+        /* Collective read: non-root ranks participate in the collective call
+         * with a zero-sized request.
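The read-zero-fill-broadcast sequence above can be summarized by the following sketch (generic names, error handling trimmed; not the PnetCDF routine itself). Rank 0 reads one fixed-size chunk, zero-fills whatever a short read near end-of-file could not supply, and broadcasts the full chunk so every rank parses identical bytes:

    #include <string.h>
    #include <mpi.h>

    static int fetch_chunk(MPI_File fh, MPI_Offset off, char *chunk,
                           int chunk_size, int rank, MPI_Comm comm)
    {
        if (rank == 0) {
            MPI_Status st;
            int got = 0;
            MPI_File_read_at(fh, off, chunk, chunk_size, MPI_BYTE, &st);
            MPI_Get_count(&st, MPI_BYTE, &got);
            if (got < chunk_size)              /* short read near EOF */
                memset(chunk + got, 0, chunk_size - got);
        }
        /* the fixed-size broadcast is why the zero-fill above matters */
        return MPI_Bcast(chunk, chunk_size, MPI_BYTE, 0, comm);
    }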
+ */ + buf_view.size = 0; + ncmpio_file_read_at_all(gbp->ncp, 0, NULL, buf_view); } - if (gbp->safe_mode == 1 && nprocs > 1) { - TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->comm); + if (gbp->ncp->safe_mode == 1 && nprocs > 1) { + TRACE_COMM(MPI_Bcast)(&err, 1, MPI_INT, 0, gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); if (err != NC_NOERR) return err; } /* broadcast root's read (full or partial header) to other processes */ - if (nprocs > 1) - TRACE_COMM(MPI_Bcast)(gbp->base, gbp->chunk, MPI_BYTE, 0, gbp->comm); + if (nprocs > 1) { + TRACE_COMM(MPI_Bcast)(gbp->base, gbp->ncp->chunk, MPI_BYTE, 0, + gbp->ncp->comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Bcast"); + } gbp->pos = gbp->base; @@ -503,7 +501,7 @@ hdr_get_nc_type(bufferinfo *gbp, nc_type *xtypep) if (xtype < NC_BYTE) DEBUG_RETURN_ERROR(NC_EBADTYPE) - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { if (xtype > NC_DOUBLE) DEBUG_RETURN_ERROR(NC_EBADTYPE) } @@ -536,7 +534,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) *namep = NULL; /* get nelems (string length of name) */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -564,7 +562,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) */ padding = PNETCDF_RNDUP(nchars, X_ALIGN) - nchars; - bufremain = gbp->chunk - (gbp->pos - gbp->base); + bufremain = gbp->ncp->chunk - (gbp->pos - gbp->base); cpos = *namep; @@ -585,7 +583,7 @@ hdr_get_NC_name(bufferinfo *gbp, char **namep, size_t *name_len) *namep = NULL; return err; } - bufremain = gbp->chunk; + bufremain = gbp->ncp->chunk; } } @@ -659,7 +657,7 @@ hdr_get_NC_dim(bufferinfo *gbp, int unlimited_id, NC_dim **dimpp) else if (err != NC_NOERR) return err; /* get dim_length */ - if (gbp->version < 5) { + if (gbp->ncp->format < 5) { uint tmp; err = hdr_get_uint32(gbp, &tmp); dim_length = (MPI_Offset)tmp; @@ -730,7 +728,7 @@ hdr_get_NC_dimarray(bufferinfo *gbp, NC_dimarray *ncap) if (err != NC_NOERR) return err; /* read nelems (number of dimensions) from gbp buffer */ - if (gbp->version < 5) { /* nelems is */ + if (gbp->ncp->format < 5) { /* nelems is */ uint tmp; err = hdr_get_uint32(gbp, &tmp); if (err != NC_NOERR) return err; @@ -809,8 +807,8 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp) nbytes = attrp->nelems * xsz; padding = attrp->xsz - nbytes; - bufremain = gbp->chunk - (gbp->pos - gbp->base); - /* gbp->chunk is the read chunk size, which is of type 4-byte int. + bufremain = gbp->ncp->chunk - (gbp->pos - gbp->base); + /* gbp->ncp->chunk is the read chunk size, which is of type 4-byte int. 
* thus bufremain should be less than INT_MAX */
 
     /* get values */
@@ -823,10 +821,9 @@ hdr_get_NC_attrV(bufferinfo *gbp, NC_attr *attrp)
             value = (void *)((char *)value + attcount);
             bufremain -= attcount;
         } else {
-            int err;
             err = hdr_fetch(gbp);
             if (err != NC_NOERR) return err;
-            bufremain = gbp->chunk;
+            bufremain = gbp->ncp->chunk;
         }
     }
 
@@ -906,7 +903,7 @@ hdr_get_NC_attr(bufferinfo *gbp, NC_attr **attrpp)
     }
 
     /* get nelems */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         nelems = (MPI_Offset)tmp;
@@ -977,7 +974,7 @@ hdr_get_NC_attrarray(bufferinfo *gbp, NC_attrarray *ncap)
     if (err != NC_NOERR) return err;
 
     /* read nelems (number of attributes) from gbp buffer */
-    if (gbp->version < 5) { /* nelems is */
+    if (gbp->ncp->format < 5) { /* nelems is */
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) return err;
@@ -1061,7 +1058,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     else if (err != NC_NOERR) return err;
 
     /* nelems (number of dimensions) */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) {
@@ -1099,7 +1096,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     /* get [dimid ...] */
     for (dim=0; dim<varp->ndims; dim++) {
-        if (gbp->version < 5) {
+        if (gbp->ncp->format < 5) {
             uint tmp;
             err = hdr_get_uint32(gbp, &tmp);
             if (err != NC_NOERR) break;
@@ -1135,7 +1132,7 @@ hdr_get_NC_var(bufferinfo *gbp,
     ncmpii_xlen_nc_type(varp->xtype, &varp->xsz);
 
     /* get vsize */
-    if (gbp->version < 5) {
+    if (gbp->ncp->format < 5) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         varp->len = (MPI_Offset)tmp;
@@ -1164,7 +1161,7 @@
      */
 
     /* get begin */
-    if (gbp->version == 1) {
+    if (gbp->ncp->format == 1) {
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         varp->begin = (MPI_Offset)tmp;
@@ -1223,7 +1220,7 @@ hdr_get_NC_vararray(bufferinfo *gbp,
     if (err != NC_NOERR) return err;
 
     /* read nelems (number of variables) from gbp buffer */
-    if (gbp->version < 5) { /* nelems is */
+    if (gbp->ncp->format < 5) { /* nelems is */
         uint tmp;
         err = hdr_get_uint32(gbp, &tmp);
         if (err != NC_NOERR) return err;
@@ -1339,24 +1336,13 @@ ncmpio_hdr_get_NC(NC *ncp)
     assert(ncp != NULL);
 
     /* Initialize the get buffer that stores the header read from the file */
-    getbuf.comm          = ncp->comm;
-    getbuf.collective_fh = ncp->collective_fh;
-    getbuf.get_size      = 0;
-    getbuf.offset        = 0;   /* read from start of the file */
-    getbuf.safe_mode     = ncp->safe_mode;
-    if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL))
-        getbuf.coll_mode = 1;
-    else
-        getbuf.coll_mode = 0;
-
-    /* CDF-5's minimum header size is 4 bytes more than CDF-1 and CDF-2's */
-    getbuf.chunk = PNETCDF_RNDUP( MAX(MIN_NC_XSZ+4, ncp->chunk), X_ALIGN );
+    getbuf.ncp    = ncp;
+    getbuf.offset = 0;   /* read from start of the file */
+    getbuf.base   = (char*) NCI_Malloc(getbuf.ncp->chunk);
+    getbuf.pos    = getbuf.base;
+    getbuf.end    = getbuf.base + getbuf.ncp->chunk;
 
-    getbuf.base = (char*) NCI_Malloc(getbuf.chunk);
-    getbuf.pos  = getbuf.base;
-    getbuf.end  = getbuf.base + getbuf.chunk;
-
-    /* Fetch the next header chunk. The chunk is 'gbp->chunk' bytes big */
+    /* Fetch the next header chunk.
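The hdr_get_* routines above all share one consumption pattern: take bytes from the in-memory chunk and refill it whenever it runs dry. A sketch under stated assumptions, where refill_chunk is a hypothetical stand-in for hdr_fetch:

    #include <string.h>

    extern int refill_chunk(char *base);  /* hypothetical hdr_fetch stand-in */

    static int get_bytes(char **pos, char *base, size_t chunk_size,
                         size_t *bufremain, char *dst, size_t nbytes)
    {
        while (nbytes > 0) {
            if (*bufremain == 0) {         /* chunk exhausted: refill it */
                if (refill_chunk(base) != 0)
                    return -1;
                *pos = base;
                *bufremain = chunk_size;
            }
            size_t take = (nbytes < *bufremain) ? nbytes : *bufremain;
            memcpy(dst, *pos, take);
            dst += take;  *pos += take;
            *bufremain -= take;  nbytes -= take;
        }
        return 0;
    }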
The chunk is 'gbp->ncp->chunk' bytes big */ err = hdr_fetch(&getbuf); if (err != NC_NOERR) return err; @@ -1381,20 +1367,20 @@ ncmpio_hdr_get_NC(NC *ncp) goto fn_exit; } - /* check version number in last byte of magic */ - if (magic[3] == 0x1) { - getbuf.version = ncp->format = 1; - } else if (magic[3] == 0x2) { - getbuf.version = ncp->format = 2; - } else if (magic[3] == 0x5) { - getbuf.version = ncp->format = 5; - } else { + /* check format version number in last byte of magic */ + if (magic[3] == 0x1) + ncp->format = 1; + else if (magic[3] == 0x2) + ncp->format = 2; + else if (magic[3] == 0x5) + ncp->format = 5; + else { NCI_Free(getbuf.base); DEBUG_RETURN_ERROR(NC_ENOTNC) /* not a netCDF file */ } /* get numrecs from getbuf into ncp */ - if (getbuf.version < 5) { + if (getbuf.ncp->format < 5) { uint tmp=0; err = hdr_get_uint32(&getbuf, &tmp); if (err != NC_NOERR) goto fn_exit; @@ -1449,7 +1435,6 @@ ncmpio_hdr_get_NC(NC *ncp) if (err != NC_NOERR) goto fn_exit; fn_exit: - ncp->get_size += getbuf.get_size; NCI_Free(getbuf.base); return (err == NC_NOERR) ? status : err; diff --git a/src/drivers/ncmpio/ncmpio_header_put.c b/src/drivers/ncmpio/ncmpio_header_put.c index 8daf88c67..387e9cfba 100644 --- a/src/drivers/ncmpio/ncmpio_header_put.c +++ b/src/drivers/ncmpio/ncmpio_header_put.c @@ -49,7 +49,7 @@ hdr_put_NC_name(bufferinfo *pbp, size_t nchars = strlen(name); /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) err = ncmpix_put_uint32((void**)(&pbp->pos), (uint)nchars); else err = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)nchars); @@ -78,7 +78,7 @@ hdr_put_NC_dim(bufferinfo *pbp, if (err != NC_NOERR) return err; /* copy dim_length */ - if (pbp->version < 5) { + if (pbp->ncp->format < 5) { /* TODO: Isn't checking dimension size already done in def_dim()? 
*/
         if (dimp->size > NC_MAX_INT)
             DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
@@ -116,7 +116,7 @@ hdr_put_NC_dimarray(bufferinfo *pbp,
         if (status != NC_NOERR) return status;
 
         /* put a ZERO or ZERO64 depending on which CDF format */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), 0);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), 0);
@@ -128,7 +128,7 @@
         if (status != NC_NOERR) return status;
 
         /* copy nelems */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined);
@@ -175,7 +175,7 @@ hdr_put_NC_attrV(bufferinfo *pbp,
     sz = attrp->nelems * xsz;
     padding = attrp->xsz - sz;
 
-    if (pbp->version < 5 && sz > NC_MAX_INT)
+    if (pbp->ncp->format < 5 && sz > NC_MAX_INT)
         DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
 
     memcpy(pbp->pos, attrp->xvalue, (size_t)sz);
@@ -214,7 +214,7 @@ hdr_put_NC_attr(bufferinfo *pbp,
     if (status != NC_NOERR) return status;
 
     /* copy nelems */
-    if (pbp->version < 5) {
+    if (pbp->ncp->format < 5) {
         if (attrp->nelems > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
         status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)attrp->nelems);
@@ -258,7 +258,7 @@ hdr_put_NC_attrarray(bufferinfo *pbp,
         if (status != NC_NOERR) return status;
 
         /* put a ZERO or ZERO64 depending on which CDF format */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), 0);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), 0);
@@ -270,7 +270,7 @@
         if (status != NC_NOERR) return status;
 
         /* copy nelems */
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined);
@@ -314,7 +314,7 @@ hdr_put_NC_var(bufferinfo *pbp,
     if (status != NC_NOERR) return status;
 
     /* copy nelems */
-    if (pbp->version < 5)
+    if (pbp->ncp->format < 5)
         status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->ndims);
     else
         status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->ndims);
@@ -322,7 +322,7 @@
 
     /* copy [dimid ...] */
     for (i=0; i<varp->ndims; i++) {
-        if (pbp->version < 5)
+        if (pbp->ncp->format < 5)
             status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->dimids[i]);
         else
             status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)varp->dimids[i]);
@@ -341,7 +341,7 @@
     /* in CDF-1 and CDF-2, a variable's size in the header is a 32-bit integer
      * in CDF-5, it is a 64-bit integer
      */
-    if (pbp->version < 5) {
+    if (pbp->ncp->format < 5) {
         /* Special case, when there is no record variable, the last fixed-size
          * variable can be larger than 2 GiB if its file starting offset is
         * less than 2 GiB.
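Every put/get pair in these header routines dispatches on the CDF format version the same way: CDF-1/2 (format < 5) store counts and sizes as 32-bit values, CDF-5 as 64-bit, so each put must range-check before narrowing. A sketch with hypothetical helpers, where put_u32/put_u64 stand in for ncmpix_put_uint32/ncmpix_put_uint64:

    #include <stdint.h>

    extern int put_u32(void **pos, uint32_t v);  /* hypothetical helpers */
    extern int put_u64(void **pos, uint64_t v);

    static int put_header_count(void **pos, int format, long long val)
    {
        if (format < 5) {
            if (val > 2147483647LL)  /* NC_MAX_INT: value cannot narrow */
                return -1;           /* caller maps this to NC_EINTOVERFLOW */
            return put_u32(pos, (uint32_t)val);
        }
        return put_u64(pos, (uint64_t)val);
    }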
This checking has already been done in the call @@ -367,7 +367,7 @@ hdr_put_NC_var(bufferinfo *pbp, /* in CDF-1 header, a variable's starting file offset is a 32-bit integer * in CDF-2 and CDF-5, it is a 64-bit integer */ - if (pbp->version == 1) { + if (pbp->ncp->format == 1) { if (varp->begin > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)varp->begin); @@ -407,7 +407,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* put a ZERO or ZERO64 depending on which CDF format */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), 0); else status = ncmpix_put_uint64((void**)(&pbp->pos), 0); @@ -419,7 +419,7 @@ hdr_put_NC_vararray(bufferinfo *pbp, if (status != NC_NOERR) return status; /* copy nelems */ - if (pbp->version < 5) + if (pbp->ncp->format < 5) status = ncmpix_put_uint32((void**)(&pbp->pos), (uint)ncap->ndefined); else status = ncmpix_put_uint64((void**)(&pbp->pos), (uint64)ncap->ndefined); @@ -441,20 +441,14 @@ hdr_put_NC_vararray(bufferinfo *pbp, int ncmpio_hdr_put_NC(NC *ncp, void *buf) { - int status; + int err; bufferinfo putbuf; MPI_Offset nrecs=0; - putbuf.comm = ncp->comm; - putbuf.collective_fh = ncp->collective_fh; - putbuf.offset = 0; - putbuf.pos = buf; - putbuf.base = buf; - putbuf.safe_mode = ncp->safe_mode; - if (ncp->nprocs > 1 && fIsSet(ncp->flags, NC_HCOLL)) - putbuf.coll_mode = 1; - else - putbuf.coll_mode = 0; + putbuf.ncp = ncp; + putbuf.offset = 0; + putbuf.pos = buf; + putbuf.base = buf; /* netCDF file format: * netcdf_file = header data @@ -462,43 +456,37 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ /* copy "magic", 4 characters */ - if (ncp->format == 5) { - putbuf.version = 5; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); - } - else if (ncp->format == 2) { - putbuf.version = 2; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); - } - else { - putbuf.version = 1; - status = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); - } - if (status != NC_NOERR) return status; + if (ncp->format == 5) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic5), ncmagic5); + else if (ncp->format == 2) + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic2), ncmagic2); + else + err = ncmpix_putn_text((void **)(&putbuf.pos), sizeof(ncmagic1), ncmagic1); + if (err != NC_NOERR) return err; /* copy numrecs, number of records */ nrecs = ncp->numrecs; if (ncp->format < 5) { if (nrecs > NC_MAX_INT) DEBUG_RETURN_ERROR(NC_EINTOVERFLOW) - status = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); + err = ncmpix_put_uint32((void**)(&putbuf.pos), (uint)nrecs); } else { - status = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); + err = ncmpix_put_uint64((void**)(&putbuf.pos), (uint64)nrecs); } - if (status != NC_NOERR) return status; + if (err != NC_NOERR) return err; /* copy dim_list */ - status = hdr_put_NC_dimarray(&putbuf, &ncp->dims); - if (status != NC_NOERR) return status; + err = hdr_put_NC_dimarray(&putbuf, &ncp->dims); + if (err != NC_NOERR) return err; /* copy gatt_list */ - status = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); - if (status != NC_NOERR) return status; + err = hdr_put_NC_attrarray(&putbuf, &ncp->attrs); + if (err != NC_NOERR) return err; /* copy var_list */ - status = hdr_put_NC_vararray(&putbuf, &ncp->vars); - if (status != NC_NOERR) return status; + err = hdr_put_NC_vararray(&putbuf, &ncp->vars); + if (err != 
NC_NOERR) return err; return NC_NOERR; } @@ -514,11 +502,12 @@ ncmpio_hdr_put_NC(NC *ncp, void *buf) */ int ncmpio_write_header(NC *ncp) { - char *mpi_name; - int status=NC_NOERR, mpireturn, err; + int status=NC_NOERR, mpireturn; size_t i, ntimes; - MPI_File fh; - MPI_Status mpistatus; + PNCIO_View buf_view; + + buf_view.count = 1; + buf_view.is_contig = 1; /* Write the entire header to the file. This function may be called from * a rename API. In that case, we cannot just change the variable name in @@ -526,10 +515,6 @@ int ncmpio_write_header(NC *ncp) * all metadata following the new name must be moved ahead. */ - fh = ncp->collective_fh; - if (NC_indep(ncp)) /* called in independent data mode */ - fh = ncp->independent_fh; - /* update file header size, as this subroutine may be called from a rename * API (var or attribute) and the new name is smaller/bigger which changes * the header size. We recalculate ncp->xsz by getting the un-aligned size @@ -555,42 +540,17 @@ int ncmpio_write_header(NC *ncp) buf_ptr = buf; for (i=0; i<ntimes; i++) { + if (fIsSet(ncp->flags, NC_HCOLL)) /* header collective write */ + wlen = ncmpio_file_write_at_all(ncp, offset, buf_ptr, buf_view); + else /* header independent write */ + wlen = ncmpio_file_write_at(ncp, offset, buf_ptr, buf_view); + if (status == NC_NOERR && wlen < 0) status = (int)wlen; - /* explicitly initialize mpistatus object to 0. For zero-length - * read, MPI_Get_count may report incorrect result for some MPICH - * version, due to the uninitialized MPI_Status object passed to - * MPI-IO calls. Thus we initialize it above to work around. - */ - memset(&mpistatus, 0, sizeof(MPI_Status)); - - if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - TRACE_IO(MPI_File_write_at_all, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - else { /* header independent write */ - TRACE_IO(MPI_File_write_at, (fh, offset, buf_ptr, writeLen, - MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - if (status == NC_NOERR) { - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - else { - /* update the number of bytes written since file open. - * Because each MPI write writes no more than NC_MAX_INT, - * calling MPI_Get_count() is sufficient.
No need to call - * MPI_Get_count_c() - */ - int put_size; - mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size); - if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED) - ncp->put_size += ncp->xsz; - else - ncp->put_size += writeLen; - } offset += writeLen; buf_ptr += writeLen; remain -= writeLen; @@ -598,10 +558,9 @@ int ncmpio_write_header(NC *ncp) NCI_Free(buf); } else if (fIsSet(ncp->flags, NC_HCOLL)) { /* header collective write */ - /* collective write: other processes participate the collective call */ - for (i=0; i<ntimes; i++) if (ncp->safe_mode == 1) { diff --git a/src/drivers/ncmpio/ncmpio_i_getput.m4 b/src/drivers/ncmpio/ncmpio_i_getput.m4 index 7f624207d..b5b4f09db 100644 --- a/src/drivers/ncmpio/ncmpio_i_getput.m4 +++ b/src/drivers/ncmpio/ncmpio_i_getput.m4 @@ -122,6 +122,11 @@ ncmpio_add_record_requests(NC_lead_req *lead_list, reqs[i].lead_off = reqs[0].lead_off; reqs[i].xbuf = xbuf; xbuf += rec_bufsize; + + /* copy the number of flattened offset-length pairs */ + reqs[i].npairs = reqs[0].npairs; + reqs[i].offset_start = reqs[0].offset_start; + reqs[i].offset_end = reqs[0].offset_end; } return NC_NOERR; @@ -142,7 +147,7 @@ ncmpio_igetput_varm(NC *ncp, int reqMode) { void *xbuf=NULL; - int i, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; + int i, j, err=NC_NOERR, abuf_index=-1, isize, xsize, new_nreqs, rem; int mpireturn, buftype_is_contig=1, need_convert, free_xbuf=0; int need_swap, can_swap_in_place, need_swap_back_buf=0; MPI_Offset nelems=0, nbytes, *ptr; @@ -520,9 +525,13 @@ ncmpio_igetput_varm(NC *ncp, } /* allocate a single array for non-leads to store start/count/stride */ + req->npairs = 0; if (varp->ndims == 0) { /* scalar variable, start may be NULL */ lead_req->start = NULL; req->start = NULL; + req->npairs = 1; + req->offset_start = 0; /* relative to var's begin */ + req->offset_end = varp->xsz; } else if (stride == NULL) { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -536,6 +545,12 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, start, memChunk); ptr += varp->ndims; memcpy(ptr, count, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + j = IS_RECVAR(varp) ? 1 : 0; + for (i=j; i<varp->ndims-1; i++) + req->npairs *= count[i]; } else { size_t memChunk = varp->ndims * SIZEOF_MPI_OFFSET; @@ -551,12 +566,24 @@ ncmpio_igetput_varm(NC *ncp, memcpy(ptr, count, memChunk); ptr += varp->ndims; memcpy(ptr, stride, memChunk); + + /* calculate number of flattened offset-length pairs */ + req->npairs = (stride[varp->ndims-1] == 1) ? 1 : count[varp->ndims-1]; + j = IS_RECVAR(varp) ?
1 : 0; + for (i=j; i<varp->ndims-1; i++) + req->npairs *= count[i]; } /* set the properties of non-lead request */ req->xbuf = xbuf; req->nelems = nelems; + /* special treatment when there is only one offset-length pair */ + if (req->npairs == 1 && varp->ndims > 0) { + ncmpio_calc_off(ncp, varp, start, &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + if (IS_RECVAR(varp)) { /* save the last record number accessed */ if (stride == NULL) @@ -576,6 +603,8 @@ ncmpio_igetput_varm(NC *ncp, : ncp->get_lead_list; req->nelems /= count[0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; /* add (count[0]-1) number of (sub)requests */ ncmpio_add_record_requests(lead_list, req, count[0], stride); diff --git a/src/drivers/ncmpio/ncmpio_i_varn.m4 b/src/drivers/ncmpio/ncmpio_i_varn.m4 index be9af9752..8bad268f0 100644 --- a/src/drivers/ncmpio/ncmpio_i_varn.m4 +++ b/src/drivers/ncmpio/ncmpio_i_varn.m4 @@ -452,6 +452,12 @@ igetput_varn(NC *ncp, lead_req->max_rec = -1; lead_req->nonlead_num = new_nreqs; +#if 0 +MPI_Aint addr; +MPI_Get_address(lead_req->xbuf, &addr); +printf("%s at %d: lead_req xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr,lead_req->nelems); +#endif + /* varn APIs have no argument stride */ fSet(lead_req->flag, NC_REQ_STRIDE_NULL); @@ -466,6 +472,8 @@ igetput_varn(NC *ncp, xbufp = (char*)xbuf; for (i=0; i<num; i++) { + req->npairs = 0; + if (req_nelems[i] == 0) continue; /* ignore this 0-length request i */ req->nelems = req_nelems[i]; @@ -473,11 +481,17 @@ igetput_varn(NC *ncp, req->xbuf = xbufp; xbufp += req_nelems[i] * xsize; +#if 0 +MPI_Get_address(req->xbuf, &addr); +printf("%s at %d: req i=%d xbuf=%ld off=%ld nelems=%lld\n",__func__,__LINE__, i,addr,(char*)req->xbuf - (char*)xbuf,req->nelems); +#endif + /* copy starts[i] and counts[i] over to req */ req->start = start_ptr; memcpy(start_ptr, starts[i], memChunk); start_ptr += varp->ndims; /* count[] */ if (counts == NULL || counts[i] == NULL) { + /* counts == NULL, equivalent to all 1s */ for (j=0; j<varp->ndims; j++) start_ptr[j] = 1; /* start_ptr is now counts[] */ } @@ -492,6 +506,24 @@ igetput_varn(NC *ncp, if (counts == NULL || counts[i] == NULL) num_rec = 1; else num_rec = counts[i][0]; + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=1; j<varp->ndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } + max_rec = starts[i][0] + num_rec; lead_req->max_rec = MAX(lead_req->max_rec, max_rec); @@ -506,6 +538,11 @@ igetput_varn(NC *ncp, lead_list = (fIsSet(reqMode, NC_REQ_WR)) ?
ncp->put_lead_list : ncp->get_lead_list; + + req->nelems /= counts[i][0]; + if (req->npairs == 1) + req->offset_end = req->nelems * varp->xsz; + /* append (counts[i][0]-1) number of requests to the queue */ ncmpio_add_record_requests(lead_list, req, counts[i][0], NULL); start_ptr += (counts[i][0] - 1) * 2 * varp->ndims; @@ -514,8 +551,26 @@ igetput_varn(NC *ncp, else req++; } - else + else { + /* calculate number of flattened offset-length pairs */ + req->npairs = 1; + if (counts == NULL || counts[i] == NULL) { + /* equivalent to all multiple var1 APIs */ + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + // req->offset_end = req->offset_start + varp->xsz; + req->offset_end = varp->xsz; + } + else { + for (j=0; j<varp->ndims-1; j++) + req->npairs *= counts[i][j]; + /* special treatment for when there is only one pair */ + if (req->npairs == 1) { + ncmpio_calc_off(ncp, varp, starts[i], &req->offset_start); + req->offset_end = req->nelems * varp->xsz; + } + } req++; + } } if (reqid != NULL) *reqid = lead_req->id; diff --git a/src/drivers/ncmpio/ncmpio_intra_node.c b/src/drivers/ncmpio/ncmpio_intra_node.c index 90d613eb4..38b8c38dd 100644 --- a/src/drivers/ncmpio/ncmpio_intra_node.c +++ b/src/drivers/ncmpio/ncmpio_intra_node.c @@ -3,31 +3,40 @@ * See COPYRIGHT notice in top-level directory. * * This file contains the implementation of intra-node aggregation feature, - * which is designed for the I/O patterns that contain many noncontiguous - * requests interleaved among processes, and spreading across a wide range of - * file space. It is particularly useful when the number of MPI processes - * allocated to a compute node is large. + * which is designed to improve performance for I/O patterns that contain many + * noncontiguous requests interleaved among processes, with an aggregate + * access region on each process wide enough to involve file stripes handled + * by almost all the file servers. By reducing the number of processes per + * node that participate in MPI-IO operations, this feature can effectively + * reduce communication contention, which often happens to jobs that run a + * large number of MPI processes per compute node. * - * This feature is enabled by setting the PnetCDF hint 'nc_num_aggrs_per_node' - * to a positive integral value indicating the desired number of processes per - * compute node to be selected as the intra-node I/O aggregators. Each process - * is assigned a unique aggregator. The non-aggregators send their requests to - * the assigned aggregators, and then the aggregators make MPI-IO requests to - * the file. + * Users can enable this feature by setting the PnetCDF I/O hint named + * 'nc_num_aggrs_per_node' to a positive integral value, indicating the desired + * number of processes per compute node to be selected as the intra-node I/O + * aggregators. Processes running on the same node are divided into groups. + * The process with the lowest rank ID is selected as the I/O aggregator of + * that group. Non-aggregators send their requests to their aggregators, and + * then the aggregators make I/O requests to the file, i.e. only aggregators + * make MPI-IO calls. * - * Such strategy can effectively reduce communication congestion due to many - * pending asynchronous messages produced in the collective write inside of - * MPI-IO.
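For reference, an application opts in to this feature through an MPI_Info hint at file create or open time. The following is a minimal sketch, assuming nothing beyond the hint name documented in the comment above; the file name and the value "2" are arbitrary choices for illustration.

#include <mpi.h>
#include <pnetcdf.h>

int main(int argc, char **argv)
{
    int err, ncid;
    MPI_Info info;

    MPI_Init(&argc, &argv);
    MPI_Info_create(&info);
    /* request 2 intra-node aggregators per compute node */
    MPI_Info_set(info, "nc_num_aggrs_per_node", "2");

    err = ncmpi_create(MPI_COMM_WORLD, "testfile.nc", NC_CLOBBER, info, &ncid);
    if (err == NC_NOERR) ncmpi_close(ncid);

    MPI_Info_free(&info);
    MPI_Finalize();
    return 0;
}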
+ * Because communication within a node can be achieved by memory copy + * operations, its cost is expected to be much lower than that of inter-node + * communication. This feature can therefore effectively reduce the + * communication congestion or exhaustion of message queues caused by the many + * pending asynchronous messages produced in two-phase I/O, the strategy used + * to implement MPI collective I/O. * - * The concept of intra-node request aggregation is based on the paper: + * The concept of intra-node request aggregation and its performance results + * are presented in the following paper. * Q. Kang, S. Lee, K. Hou, R. Ross, A. Agrawal, A. Choudhary, and W. Liao. * Improving MPI Collective I/O for High Volume Non-Contiguous Requests With * Intra-Node Aggregation. IEEE Transactions on Parallel and Distributed - * Systems (TPDS), 31(11):2682-2695, November 2020. + * Systems, 31(11):2682-2695, November 2020. */ #ifdef HAVE_CONFIG_H -# include <config.h> +#include <config.h> #endif #include @@ -41,28 +50,34 @@ #include #include "ncmpio_NC.h" +/* swap values of x and y */ +#define SWAP1(x, y, tmp) { tmp = x ; x = y; y = tmp ; } + #ifdef HAVE_MPI_LARGE_COUNT +/* swap elements of arrays x, y, and corresponding lengths and bufAddr */ #define SWAP(offsets, lengths, bufAddr, x, y) { \ MPI_Count aint; \ MPI_Count cint; \ MPI_Count d0 = (x) - offsets; \ MPI_Count d1 = (y) - offsets; \ if (d0 != d1) { \ - cint = *(x) ; *(x) = *(y) ; *(y) = cint ; \ - cint = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = cint ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), cint); \ + SWAP1(lengths[d0], lengths[d1], cint); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #else #define SWAP(offsets, lengths, bufAddr, x, y) { \ int int4; \ - MPI_Aint aint; \ - MPI_Aint d0 = (x) - offsets; \ - MPI_Aint d1 = (y) - offsets; \ + MPI_Offset aint; \ + MPI_Offset d0 = (x) - offsets; \ + MPI_Offset d1 = (y) - offsets; \ if (d0 != d1) { \ - aint = *(x) ; *(x) = *(y) ; *(y) = aint ; \ - int4 = lengths[d0] ; lengths[d0] = lengths[d1] ; lengths[d1] = int4 ; \ - aint = bufAddr[d0] ; bufAddr[d0] = bufAddr[d1] ; bufAddr[d1] = aint ; \ + SWAP1(*(x), *(y), aint); \ + SWAP1(lengths[d0], lengths[d1], int4); \ + if (bufAddr != NULL) \ + SWAP1(bufAddr[d0], bufAddr[d1], aint); \ } \ } #endif @@ -71,28 +86,36 @@ ((*(b) < *(c)) ? (b) : ((*(a) < *(c)) ? (c) : (a))) : \ ((*(b) > *(c)) ? (b) : ((*(a) < *(c)) ? (a) : (c)))) +static +size_t bin_search( +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count key, MPI_Count *base, +#else + MPI_Offset key, MPI_Offset *base, +#endif + size_t nmemb); + /*----< qsort_off_len_buf() >------------------------------------------------*/ -/* Sort three arrays of offsets, lengths, and buffer addresses based on the - * increasing order of offsets. This code is based on the qsort routine from - * Bentley & McIlroy's "Engineering a Sort Function". +/* Sort three arrays of offsets, lengths, and buffer addresses by the offsets + * array, into increasing order. This code is based on the qsort routine + * from Bentley & McIlroy's "Engineering a Sort Function".
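The sorting routine keeps three parallel arrays consistent while ordering by offsets. The toy version below illustrates the same invariant with an insertion sort, chosen purely for brevity; the library code uses the qsort variant described above.

#include <stdio.h>

/* sort off[] ascending, applying identical swaps to len[] and addr[] */
static void sort_parallel(int n, long long *off, long long *len, long long *addr)
{
    int i, j;
    for (i = 1; i < n; i++) {
        long long o = off[i], l = len[i], a = addr[i];
        for (j = i; j > 0 && off[j-1] > o; j--) {
            off[j] = off[j-1]; len[j] = len[j-1]; addr[j] = addr[j-1];
        }
        off[j] = o; len[j] = l; addr[j] = a;
    }
}

int main(void)
{
    long long off[4]  = {30, 10, 20, 0};
    long long len[4]  = {3, 1, 2, 4};
    long long addr[4] = {300, 100, 200, 400};
    int i;
    sort_parallel(4, off, len, addr);
    for (i = 0; i < 4; i++)
        printf("off=%lld len=%lld addr=%lld\n", off[i], len[i], addr[i]);
    return 0;
}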
*/ static void -qsort_off_len_buf(MPI_Aint num, +qsort_off_len_buf(MPI_Aint num, #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *offsets, - MPI_Count *lengths, + MPI_Count *offsets, + MPI_Count *lengths, #else - MPI_Aint *offsets, - int *lengths, + MPI_Offset *offsets, + int *lengths, #endif - MPI_Aint *bufAddr) + MPI_Aint *bufAddr) { #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Count *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #else - MPI_Aint *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt; + MPI_Offset *pa, *pb, *pc, *pd, *pl, *pm, *pn, cmp_result, swap_cnt, i, r; #endif - MPI_Aint i, r; while (1) { swap_cnt = 0; @@ -155,7 +178,8 @@ qsort_off_len_buf(MPI_Aint num, if ((r = pd - pc) > 1) { /* Iterate rather than recursively call self to save stack space */ lengths = lengths + (num - r); - bufAddr = bufAddr + (num - r); + if (bufAddr != NULL) + bufAddr = bufAddr + (num - r); offsets = pn - r; num = r; } @@ -164,174 +188,238 @@ qsort_off_len_buf(MPI_Aint num, } } -/*----< ncmpio_init_intra_node_aggr() >--------------------------------------*/ -/* When intra-node write aggregation is enabled, processes on the same node - * will be divided into groups. The number of groups is the number of - * aggregators on that node. The rank IDs of each group must be established. +/*----< heap_merge() >-------------------------------------------------------*/ +/* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. Function calls are too expensive. * - * 1. Find information about MPI processes and their affinity to compute node. - * 2. Determine whether self process is an intra-node aggregator. - * 3. For an aggregator, find the number of non-aggregators assigned to it and - * construct rank IDs of assigned non-aggregators. - * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * Requirement: all individual offsets lists must be already sorted !!! 
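heap_merge below performs a k-way merge of presorted lists. As a simplified, heap-free illustration of the same idea (a linear minimum scan, O(k) per element instead of the O(log k) achieved by the heap), under the same requirement that each input list is already sorted:

#include <stdio.h>
#include <stdlib.h>

/* merge k presorted lists into out[]; list[i] holds cnt[i] elements */
static void kway_merge(int k, long long **list, size_t *cnt, long long *out)
{
    size_t *pos = (size_t*) calloc(k, sizeof(size_t));
    size_t total = 0, n;
    int i, min_i;

    for (i = 0; i < k; i++) total += cnt[i];
    for (n = 0; n < total; n++) {
        min_i = -1;
        for (i = 0; i < k; i++)
            if (pos[i] < cnt[i] &&
                (min_i < 0 || list[i][pos[i]] < list[min_i][pos[min_i]]))
                min_i = i;
        out[n] = list[min_i][pos[min_i]++];
    }
    free(pos);
}

int main(void)
{
    long long a[] = {1, 4, 9}, b[] = {2, 3, 8}, c[] = {5, 7};
    long long *list[3] = {a, b, c}, out[8];
    size_t cnt[3] = {3, 3, 2}, n;
    kway_merge(3, list, cnt, out);
    for (n = 0; n < 8; n++) printf("%lld ", out[n]);
    printf("\n");
    return 0;
}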
*/ -int -ncmpio_intra_node_aggr_init(NC *ncp) +static +void heap_merge(int nprocs, + const MPI_Aint *count, /* [nprocs] */ + MPI_Aint nelems, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *offsets, /* [nelems] */ + MPI_Count *blklens, /* [nelems] */ +#else + MPI_Offset *offsets, /* [nelems] */ + int *blklens, /* [nelems] */ +#endif + MPI_Aint *bufAddr) /* [nelems] */ { - char my_procname[MPI_MAX_PROCESSOR_NAME], **all_procnames=NULL; - int i, j, k, my_procname_len, num_nodes, root=0; - int *node_ids=NULL, *all_procname_lens=NULL, *nprocs_per_node; - int naggrs_my_node, num_nonaggrs; - int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node; - - /* initialize parameters of local-node aggregation */ - ncp->my_aggr = -1; /* rank ID of my aggregator */ - ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ - ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = 0.0; + typedef struct { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *off_list; + MPI_Count *len_list; +#else + MPI_Offset *off_list; + int *len_list; #endif + MPI_Aint *addr_list; + MPI_Aint count; + } heap_struct; - if (ncp->num_aggrs_per_node == 0 || ncp->num_aggrs_per_node == ncp->nprocs) - /* disable intra-node aggregation */ - return NC_NOERR; + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + size_t sum; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + /* This heap_merge is not in-place, taking too much memory footprint */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); + MPI_Count *srt_len = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * nelems); +#else + MPI_Aint *srt_off = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + int *srt_len = (int*) NCI_Malloc(sizeof(int) * nelems); #endif + MPI_Aint *srt_addr = NULL; + + if (bufAddr != NULL) + srt_addr = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * nelems); + + a = (heap_struct *) NCI_Calloc(nprocs, sizeof(heap_struct)); + + /* there are nprocs number of lists to be merged */ + j = 0; + sum = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + /* each of a[j].off_list is already sorted */ + a[j].off_list = offsets + sum; + a[j].len_list = blklens + sum; + if (bufAddr != NULL) + a[j].addr_list = bufAddr + sum; + sum += count[i]; + a[j].count = count[i]; + j++; + } + } + nprocs = j; /* some count[i] may be zero */ - /* allocate space for storing the rank IDs of non-aggregators assigned to - * this rank. Note ncp->nonaggr_ranks[] will be freed when closing the - * file, if allocated. - */ - num_nonaggrs = ncp->nprocs / ncp->num_aggrs_per_node + 1; - ncp->nonaggr_ranks = (int*) NCI_Malloc(sizeof(int) * num_nonaggrs); +#define SWAP_HEAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } - /* Collect info about compute nodes in order to select I/O aggregators. - * Note my_procname is null character terminated, but my_procname_len - * does not include the null character. + heapsize = nprocs; + + /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. 
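The loop that follows is the classic iterative sift-down on a min-heap. Isolated from the surrounding bookkeeping, and applied to a plain array for illustration, it reduces to:

/* iterative sift-down for a min-heap stored in heap[0..heapsize-1] */
static void sift_down(long long *heap, int heapsize, int k)
{
    for (;;) {
        int r = 2 * (k + 1);   /* right child, same indexing as the code here */
        int l = r - 1;         /* left child */
        int smallest = k;
        long long tmp;
        if (l < heapsize && heap[l] < heap[smallest]) smallest = l;
        if (r < heapsize && heap[r] < heap[smallest]) smallest = r;
        if (smallest == k) break;
        tmp = heap[k]; heap[k] = heap[smallest]; heap[smallest] = tmp;
        k = smallest;
    }
}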
*/ - MPI_Get_processor_name(my_procname, &my_procname_len); - my_procname_len++; /* to include terminate null character */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - if (ncp->rank == root) { - /* root collects all procnames */ - all_procnames = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - if (all_procnames == NULL) - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - all_procname_lens = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - if (all_procname_lens == NULL) { - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } } - /* gather process name lengths from all processes first */ - MPI_Gather(&my_procname_len, 1, MPI_INT, all_procname_lens, 1, MPI_INT, - root, ncp->comm); - if (ncp->rank == root) { - int *disp; - size_t alloc_size = 0; + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < nelems; i++) { + /* extract smallest element from heap, i.e. the root */ + srt_off[i] = a[0].off_list[0]; + srt_len[i] = a[0].len_list[0]; + if (bufAddr != NULL) + srt_addr[i] = a[0].addr_list[0]; + a[0].count--; + + if (!a[0].count) { + a[0] = a[heapsize - 1]; + heapsize--; + } else { + a[0].off_list++; + a[0].len_list++; + if (bufAddr != NULL) + a[0].addr_list++; + } - for (i=0; i<ncp->nprocs; i++) - alloc_size += all_procname_lens[i]; + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if (l < heapsize && a[l].off_list[0] < a[k].off_list[0]) + smallest = l; + else + smallest = k; - all_procnames[0] = (char *) NCI_Malloc(alloc_size); - if (all_procnames[0] == NULL) { - NCI_Free(all_procname_lens); - NCI_Free(all_procnames); - DEBUG_RETURN_ERROR(NC_ENOMEM) - } + if (r < heapsize && a[r].off_list[0] < a[smallest].off_list[0]) + smallest = r; - /* Construct displacement array for the MPI_Gatherv, as each process - * may have a different length for its process name.
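The removed code used the standard two-step gather pattern: gather the per-process lengths first, then build a displacement array for MPI_Gatherv. A standalone sketch of that pattern, with illustrative names only:

#include <mpi.h>
#include <stdlib.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    char name[MPI_MAX_PROCESSOR_NAME];
    int rank, nprocs, len, i;
    int *lens = NULL, *disp = NULL;
    char *all = NULL;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Get_processor_name(name, &len);
    len++;                          /* include the terminating null */

    if (rank == 0) lens = (int*) malloc(sizeof(int) * nprocs);
    MPI_Gather(&len, 1, MPI_INT, lens, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        int total = 0;
        disp = (int*) malloc(sizeof(int) * nprocs);
        for (i = 0; i < nprocs; i++) { disp[i] = total; total += lens[i]; }
        all = (char*) malloc(total);
    }
    MPI_Gatherv(name, len, MPI_CHAR, all, lens, disp, MPI_CHAR, 0,
                MPI_COMM_WORLD);

    if (rank == 0) {
        for (i = 0; i < nprocs; i++) printf("rank %d: %s\n", i, all + disp[i]);
        free(lens); free(disp); free(all);
    }
    MPI_Finalize();
    return 0;
}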
- */ - disp = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - disp[0] = 0; - for (i=1; i<ncp->nprocs; i++) { - all_procnames[i] = all_procnames[i - 1] + all_procname_lens[i - 1]; - disp[i] = disp[i - 1] + all_procname_lens[i - 1]; + if (smallest != k) { + SWAP_HEAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; } } - /* gather all process names */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - all_procnames[0], all_procname_lens, disp, MPI_CHAR, - root, ncp->comm); +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(offsets, srt_off, sizeof(MPI_Count) * nelems); + memcpy(blklens, srt_len, sizeof(MPI_Count) * nelems); +#else + memcpy(offsets, srt_off, sizeof(MPI_Offset) * nelems); + memcpy(blklens, srt_len, sizeof(int) * nelems); +#endif + if (bufAddr != NULL) + memcpy(bufAddr, srt_addr, sizeof(MPI_Aint) * nelems); + + NCI_Free(a); + if (bufAddr != NULL) NCI_Free(srt_addr); + NCI_Free(srt_len); + NCI_Free(srt_off); +} - NCI_Free(disp); - NCI_Free(all_procname_lens); - } else - /* send process name to root */ - MPI_Gatherv(my_procname, my_procname_len, MPI_CHAR, - NULL, NULL, NULL, MPI_CHAR, root, ncp->comm); - - /* each MPI process's compute node ID */ - node_ids = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - if (ncp->rank == root) { - /* all_procnames[] can tell us the number of nodes and number of - * processes per node. - */ - char **node_names; - int last; - - /* array of pointers pointing to unique host names (compute nodes) */ - node_names = (char **) NCI_Malloc(sizeof(char*) * ncp->nprocs); - - /* number of MPI processes running on each node */ - nprocs_per_node = (int *) NCI_Malloc(sizeof(int) * ncp->nprocs); - - /* calculate nprocs_per_node[] and node_ids[] */ - last = 0; - num_nodes = 0; /* number of unique compute nodes */ - for (i=0; i<ncp->nprocs; i++) { - k = last; - for (j=0; j<num_nodes; j++) { +/*----< ncmpio_ina_init() >--------------------------------------------------*/ +/* When intra-node write aggregation is enabled, this subroutine initializes + * the metadata to be used for intra-node communication and I/O requests. + * + * Processes on the same node will first be divided into groups. A process with + * the lowest rank ID in a group is selected as the aggregator. Only the + * aggregators call the MPI-IO functions to perform I/O to the file. Thus, this + * subroutine must be called before MPI_File_open() and should be called only + * once at ncmpio_create() or ncmpio_open(). + * + * The subroutine performs the following tasks. + * 1. Make use of the affinity of each MPI process to its compute node, + * represented by ncp->num_nodes and ncp->node_ids[]. These two members of + * ncp should have been set by a call to ncmpii_construct_node_list() + * earlier during ncmpio_create() or ncmpio_open(). + * + ncp->num_nodes is the number of unique compute nodes. + * + ncp->node_ids[ncp->nprocs] contains node IDs for all processes. + * 2. Divide processes into groups, select aggregators, and determine whether + * this process is an intra-node aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + if (ncp->my_aggr == ncp->rank) then this rank is an aggregator. + * 3. For an aggregator, find the number of non-aggregators assigned to it and + * construct a list of rank IDs of non-aggregators of its group. + * + ncp->num_nonaggrs is the number of non-aggregators in its group. + * 4. For a non-aggregator, find the rank ID of its assigned aggregator. + * + ncp->my_aggr is rank ID of my aggregator. + * + ncp->nonaggr_ranks[] contains the rank IDs of assigned non-aggregators. + * 5.
Create a new MPI communicator consisting of only the aggregators. + * Obtain the rank ID and total process number of the new communicator. + * + ncp->ina_comm contains the aggregators across all nodes. + * + ncp->ina_nprocs is the number of processes in intra-node communicator. + * + ncp->ina_rank is this process's rank ID in intra-node communicator. + */ +int +ncmpio_ina_init(NC *ncp) +{ + int i, j, mpireturn, do_io, ina_nprocs, naggrs_my_node, first_rank; + int my_rank_index, *ranks_my_node, my_node_id, nprocs_my_node; + j = sizeof(ncp->ina_time_put) / sizeof(ncp->ina_time_put[0]); + ncp->ina_time_init = ncp->ina_time_flatten = 0.0; + for (i=0; i<j; i++) { + ncp->ina_time_put[i] = ncp->ina_time_get[i] = 0; + ncp->maxmem_put[i] = ncp->maxmem_get[i] = 0; } + ncp->ina_npairs_put = ncp->ina_npairs_get = 0; +#endif + + /* initialize parameters of intra-node aggregation */ + ncp->my_aggr = -1; /* rank ID of my aggregator */ + ncp->num_nonaggrs = 0; /* number of non-aggregators assigned */ + ncp->nonaggr_ranks = NULL; /* ranks of assigned non-aggregators */ - MPI_Bcast(node_ids, ncp->nprocs, MPI_INT, root, ncp->comm); + /* Note that an ill value of ncp->num_aggrs_per_node has been checked + * before entering this subroutine. Thus ncp->num_aggrs_per_node must be + * > 0. + */ - /* my_node_id is this rank's node ID */ - my_node_id = node_ids[ncp->rank]; + /* ncp->node_ids[] has been established in ncmpii_construct_node_list() + * called in ncmpio_create() or ncmpio_open() before entering this + * subroutine. my_node_id is this rank's node ID. + */ + my_node_id = ncp->node_ids[ncp->rank]; - /* nprocs_my_node: the number of processes in my node + /* nprocs_my_node: the number of processes in my node * ranks_my_node[]: rank IDs of all processes in my node. - * my_rank_index points to ranks_my_node[] where - * ranks_my_node[my_rank_index] == ncp->rank + * my_rank_index: points to ranks_my_node[] where + * ranks_my_node[my_rank_index] == ncp->rank */ ranks_my_node = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); my_rank_index = -1; nprocs_my_node = 0; for (i=0; i<ncp->nprocs; i++) { - if (node_ids[i] == my_node_id) { + if (ncp->node_ids[i] == my_node_id) { if (i == ncp->rank) my_rank_index = nprocs_my_node; ranks_my_node[nprocs_my_node] = i; @@ -339,80 +427,166 @@ ncmpio_intra_node_aggr_init(NC *ncp) } } assert(my_rank_index >= 0); - /* Now, ranks_my_node[my_rank_index] == ncp->rank */ - NCI_Free(node_ids); - - /* make sure number of aggregators in my node <= nprocs_my_node */ + /* Make sure number of aggregators in my node <= nprocs_my_node. In some + * cases, the number of processes allocated to the last few nodes can be + * less than others. + */ naggrs_my_node = MIN(ncp->num_aggrs_per_node, nprocs_my_node); - /* calculate the number of non-aggregators assigned to an aggregator. - * Note num_nonaggrs includes self. + /* For each aggregation group, calculate the number of non-aggregators, + * ncp->num_nonaggrs. Note ncp->num_nonaggrs includes self rank. */ - num_nonaggrs = nprocs_my_node / naggrs_my_node; - if (nprocs_my_node % naggrs_my_node) num_nonaggrs++; - - if (num_nonaggrs == 1) - /* disable aggregation if the number of non-aggregators assigned to - * this aggregator is 1. Note num_nonaggrs includes self. It is - * possible for aggregation enabled or disabled on different nodes and - * even different aggregation groups on the same node.
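The grouping arithmetic used in the hunk that follows can be checked with a small standalone program. The numbers below are hypothetical: 10 processes on a node and 3 aggregators requested, giving groups of 4, 4, and 2.

#include <stdio.h>

int main(void)
{
    int nprocs_my_node = 10, naggrs_my_node = 3, idx;
    int num_nonaggrs = nprocs_my_node / naggrs_my_node;
    if (nprocs_my_node % naggrs_my_node) num_nonaggrs++;   /* 4 */

    for (idx = 0; idx < nprocs_my_node; idx++) {
        int first_rank = idx - idx % num_nonaggrs;
        int group_size = num_nonaggrs;
        if (group_size > nprocs_my_node - first_rank)      /* trim last group */
            group_size = nprocs_my_node - first_rank;
        printf("index %2d -> aggregator index %d (group of %d)\n",
               idx, first_rank, group_size);
    }
    return 0;
}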
+ ncp->num_nonaggrs = nprocs_my_node / naggrs_my_node; + if (nprocs_my_node % naggrs_my_node) ncp->num_nonaggrs++; + + /* Adjust the number of non-aggregators for the last group of each node, + * to make sure it does not go beyond nprocs_my_node. + */ + first_rank = my_rank_index - my_rank_index % ncp->num_nonaggrs; + ncp->num_nonaggrs = MIN(ncp->num_nonaggrs, nprocs_my_node - first_rank); + + /* Assign the first rank as the intra-node aggregator of this group and + * set the rank ID of my aggregator for each process. + */ + ncp->my_aggr = ranks_my_node[first_rank]; + + if (ncp->num_nonaggrs == 1) { + /* When the number of processes in this group is 1, the aggregation + * is not performed. Note num_nonaggrs includes self rank. * - * Use whether ncp->my_aggr < 0 to tell if aggregation is disabled or - * enabled. + * Note this does not mean intra-node aggregation is disabled. The + * indicator of whether intra-node aggregation is enabled or disabled + * is ncp->num_aggrs_per_node, whose value should be consistent across + * all processes. It is possible for some groups to contain only one + * process, in which case the aggregation is not performed within that + * group. */ - ncp->my_aggr = -1; - else { - /* find the rank ID of aggregator assigned to this rank */ - ncp->my_aggr = ranks_my_node[my_rank_index - my_rank_index % num_nonaggrs]; + assert(ncp->my_aggr == ncp->rank); + } + else if (ncp->my_aggr == ncp->rank) { /* ncp->num_nonaggrs > 1 */ + /* Construct ncp->nonaggr_ranks[], the rank IDs of non-aggregators of + * this group. Note ncp->nonaggr_ranks[], if malloc-ed, will only be + * freed when closing the file. + */ + ncp->nonaggr_ranks = (int*)NCI_Malloc(sizeof(int) * ncp->num_nonaggrs); - if (ncp->my_aggr == ncp->rank) { /* this rank is an aggregator */ - /* Set the number of non-aggregators assigned to this rank. For the - * last group, make sure it does not go beyond nprocs_my_node. + memcpy(ncp->nonaggr_ranks, ranks_my_node + first_rank, + sizeof(int) * ncp->num_nonaggrs); + } + NCI_Free(ranks_my_node); + + /* Next step is to construct a new MPI communicator consisting of all + * intra-node aggregators. It will later be used to call MPI_File_open(), + * so that only aggregators call MPI-IO functions to access the file. + * + * When using the PnetCDF's internal PNCIO driver, we can pass a list of + * node_ids of the new communicator to the PNCIO file handler, + * ncp->pncio_fh, to spare the driver the repeated work of constructing + * the list of node IDs. If the MPI-IO driver is used, ROMIO will redo + * this work internally anyway. + */ + + do_io = (ncp->my_aggr == ncp->rank) ? 1 : 0; + + /* construct an array containing ranks of aggregators */ + ncp->ina_node_list = (int*) NCI_Malloc(sizeof(int) * ncp->nprocs); + TRACE_COMM(MPI_Allgather)(&do_io, 1, MPI_INT, ncp->ina_node_list, 1, + MPI_INT,ncp->comm); + + /* Calculate the total number of intra-node aggregators */ + for (ina_nprocs=0, i=0; i<ncp->nprocs; i++) + if (ncp->ina_node_list[i]) ina_nprocs++; + + /* Construct ncp->node_ids[] and ncp->ina_node_list[]. Their contents + * depend on the layout of MPI process allocation to the compute nodes. + * The common layouts can be two kinds: + * + cyclic - MPI ranks are assigned to nodes in a round-robin fashion, + * + block - MPI ranks are assigned to a node and then move on to the + * next. + * + * Below uses an example of nodes=3, nprocs=10, * num_aggrs_per_node=2.
+ * ncp->node_ids[] should be + * block process allocation: 0,0,0,0,1,1,1,2,2,2 + * cyclic process allocation: 0,1,2,0,1,2,0,1,2,0 + * Accordingly, ncp->ina_node_list[] can be two kinds + * block process allocation: 1,0,1,0,1,0,1,1,0,1 + * cyclic process allocation: 1,1,1,0,0,0,1,1,1,0 + */ + + /* ncp->node_ids[]: node IDs of processes in the new MPI communicator. + * ncp->ina_node_list[]: the rank IDs of the new MPI communicator. + */ + for (j=0,i=0; i<ncp->nprocs; i++) { + if (ncp->ina_node_list[i]) { + ncp->ina_node_list[j] = i; + /* Modify ncp->node_ids[] to store the node IDs of the processes in + * the new communicator. Note ncp->node_ids[] from now on is used + * by PnetCDF's PNCIO driver only. */ - ncp->num_nonaggrs = MIN(num_nonaggrs, nprocs_my_node - my_rank_index); - if (ncp->num_nonaggrs == 1) - /* disable aggregation, as this aggregation group contains only - * self rank - */ - ncp->my_aggr = -1; - else - /* copy the rank IDs over to ncp->nonaggr_ranks[] */ - memcpy(ncp->nonaggr_ranks, - ranks_my_node + my_rank_index, - sizeof(int) * num_nonaggrs); + ncp->node_ids[j] = ncp->node_ids[i]; + j++; } } - NCI_Free(ranks_my_node); - if (ncp->my_aggr < 0) { - /* free ncp->nonaggr_ranks if aggregation is not enabled */ - NCI_Free(ncp->nonaggr_ranks); - ncp->nonaggr_ranks = NULL; + /* Make MPI calls to create a new communicator. */ + MPI_Group origin_group, ina_group; + TRACE_COMM(MPI_Comm_group)(ncp->comm, &origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_group"); + TRACE_COMM(MPI_Group_incl)(origin_group, ina_nprocs, ncp->ina_node_list, &ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_incl"); + TRACE_COMM(MPI_Comm_create)(ncp->comm, ina_group, &ncp->ina_comm); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Comm_create"); + TRACE_COMM(MPI_Group_free)(&ina_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + TRACE_COMM(MPI_Group_free)(&origin_group); + if (mpireturn != MPI_SUCCESS) + return ncmpii_error_mpi2nc(mpireturn, "MPI_Group_free"); + + /* Non-aggregators will have ncp->ina_comm set to MPI_COMM_NULL */ + if (ncp->ina_comm == MPI_COMM_NULL) { + ncp->ina_nprocs = 0; + ncp->ina_rank = -1; + } + else { + MPI_Comm_size(ncp->ina_comm, &ncp->ina_nprocs); + MPI_Comm_rank(ncp->ina_comm, &ncp->ina_rank); } - /* TODO: For automatically determine Whether to enable intra-node write - * aggregation, this should be done right before each collective write - * call. - * 1. obtain hint cb_noddes, and striping_unit + /* TODO: automatically determine whether or not to enable intra-node + * aggregation. + * + * Ideally, this can be determined right before each collective write + * call, because only at that time is the communication pattern known. + * If the pattern can cause contention, then enable it. Otherwise, + * disable it. + * + * Such a mechanism may depend on the following. + * 1. MPI-IO hints cb_nodes and striping_unit + * 2. calculate aggregate access region - In each round of two-phase I/O, when the number of senders to each - cb_nodes is very large, then intra-node aggregation should be enabled. - Average of all nprocs_per_node may be a factor for determining whether - to enable intra-node aggregation. It indicates whether the high number - of processes are allocated on the same node. + 3. If the number of senders to each cb_nodes is very large, then + intra-node aggregation should be enabled. + 4.
The average of nprocs_per_node across all processes may be a factor + * for determining whether to enable intra-node aggregation. It + * indicates whether a high number of processes are allocated on the + * same node. */ #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - ncp->aggr_time = MPI_Wtime() - timing; + ncp->ina_time_init = MPI_Wtime() - timing; #endif return NC_NOERR; } /*----< flatten_subarray() >-------------------------------------------------*/ -/* flatten a subarray request into a list of offset-length pairs */ +/* Flatten a subarray request, specified by start[], count[], and stride[] into + * a list of file offset-length pairs, offsets[] and lengths[]. + */ static int flatten_subarray(int ndim, /* number of dimensions */ int el_size, /* array element size */ @@ -426,7 +600,7 @@ flatten_subarray(int ndim, /* number of dimensions */ MPI_Count *offsets, /* OUT: array of offsets */ MPI_Count *lengths /* OUT: array of lengths */ #else - MPI_Aint *offsets, /* OUT: array of offsets */ + MPI_Offset *offsets, /* OUT: array of offsets */ int *lengths /* OUT: array of lengths */ #endif ) @@ -503,12 +677,26 @@ flatten_subarray(int ndim, /* number of dimensions */ subarray_len *= count[ndim]; } + /* check if the list can be coalesced */ + for (i=0, j=1; j<*npairs; j++) { + if (offsets[i] + lengths[i] == offsets[j]) + lengths[i] += lengths[j]; + else { + i++; + if (i < j) { + offsets[i] = offsets[j]; + lengths[i] = lengths[j]; + } + } + } + *npairs = i + 1; + return NC_NOERR; } -/*----< flatten_req() >-----------------------------------------------------*/ -/* flatten one write request into offset-length pairs. - * offsets and lengths are allocated here and need to be freed by the caller +/*----< flatten_req() >------------------------------------------------------*/ +/* Flatten one subarray request into offset-length pairs. Arrays offsets and + * lengths are allocated in this subroutine and need to be freed by the caller.
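What flatten_subarray() produces, one (offset, length) pair per contiguous run plus the coalescing pass added above, can be illustrated standalone for a 2-D, row-major selection. All numbers below are hypothetical.

#include <stdio.h>

int main(void)
{
    long long shape[2] = {8, 10};            /* full array dimensions */
    long long start[2] = {2, 4}, count[2] = {3, 6};
    long long el_size = 4, var_begin = 1024; /* assumed values */
    long long off[8], len[8];
    int i, j, k, npairs = 0;

    /* one pair per row of the selection */
    for (i = 0; i < count[0]; i++) {
        off[npairs] = var_begin + ((start[0]+i) * shape[1] + start[1]) * el_size;
        len[npairs] = count[1] * el_size;
        npairs++;
    }

    /* coalesce adjacent pairs that abut in the file; when count[1] equals
     * shape[1], all rows merge into a single pair */
    k = 0;
    for (j = 1; j < npairs; j++) {
        if (off[k] + len[k] == off[j]) len[k] += len[j];
        else { k++; off[k] = off[j]; len[k] = len[j]; }
    }
    npairs = k + 1;

    for (i = 0; i < npairs; i++)
        printf("pair %d: offset=%lld length=%lld\n", i, off[i], len[i]);
    return 0;
}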
*/ static int flatten_req(NC *ncp, const MPI_Offset *start, const MPI_Offset *count, const MPI_Offset *stride, + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { - int j, err=NC_NOERR, ndims; + int i, j, err=NC_NOERR, ndims; MPI_Aint num, idx; MPI_Offset var_begin, *shape, count0, *ones=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; +#else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; +#endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -537,15 +735,17 @@ flatten_req(NC *ncp, */ if (varp->ndims == 0) { /* scalar variable */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * 2); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint)); - *lengths = (int*) NCI_Malloc(sizeof(int)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * 2); + lengths = (int*) NCI_Malloc(sizeof(int) * 2); #endif - (*offsets)[0] = varp->begin; - (*lengths)[0] = varp->xsz; + offsets[0] = varp->begin; + lengths[0] = varp->xsz; *num_pairs = 1; + *off_ptr = offsets; + *len_ptr = lengths; return NC_NOERR; } else if (varp->ndims == 1 && IS_RECVAR(varp)) { /* 1D record variable */ @@ -555,22 +755,24 @@ flatten_req(NC *ncp, num = 1; if (stride != NULL && stride[varp->ndims-1] > 1) num = count[varp->ndims-1]; /* count of last dimension */ - for (j=0; j<varp->ndims-1; j++) - num *= count[j]; /* all count[] except the last dimension */ + for (i=0; i<varp->ndims-1; i++) + num *= count[i]; /* all count[] except the last dimension */ } *num_pairs = num; #ifdef HAVE_MPI_LARGE_COUNT - *offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (num+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * num); - *lengths = (int*) NCI_Malloc(sizeof(int) * num); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (num+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (num+1)); #endif + *off_ptr = offsets; + *len_ptr = lengths; if (stride == NULL) { /* equivalent to {1, 1, ..., 1} */ ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * varp->ndims); - for (j=0; j<varp->ndims; j++) ones[j] = 1; + for (i=0; i<varp->ndims; i++) ones[i] = 1; } ndims = varp->ndims; @@ -589,13 +791,26 @@ flatten_req(NC *ncp, count0 = 1; idx = 0; - for (j=0; j<count0; j++) { flatten_subarray(ndims, varp->xsz, var_begin, shape, start, count, (stride == NULL) ?
ones : stride, - &num, /* OUT: num of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: num of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + if (num == 0) continue; + + /* check if offsets[] are in an increasing order */ + for (j=0; j<num; j++) { + if (prev_end_off > offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } + idx += num; assert(idx <= *num_pairs); @@ -605,30 +820,46 @@ if (ones != NULL) NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + return err; } /*----< flatten_reqs() >-----------------------------------------------------*/ -/* flatten all write requests into offset-length pairs. - * offsets and lengths are allocated here and need to be freed by the caller +/* Flatten multiple subarray requests into file offset-length pairs. Arrays + * offsets and lengths are allocated here and need to be freed by the caller. */ static int flatten_reqs(NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ int num_reqs, /* IN: # requests */ const NC_req *reqs, /* [num_reqs] requests */ + int *is_incr, /* OUT: are offsets incrementing */ MPI_Aint *num_pairs, /* OUT: total number of off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count **offsets, /* OUT: array of flattened offsets */ - MPI_Count **lengths /* OUT: array of flattened lengths */ + MPI_Count **off_ptr, /* OUT: array of flattened offsets */ + MPI_Count **len_ptr /* OUT: array of flattened lengths */ #else - MPI_Aint **offsets, /* OUT: array of flattened offsets */ - int **lengths /* OUT: array of flattened lengths */ + MPI_Offset **off_ptr, /* OUT: array of flattened offsets */ + int **len_ptr /* OUT: array of flattened lengths */ #endif ) { int i, j, status=NC_NOERR, ndims, max_ndims=0; MPI_Aint num, idx; MPI_Offset *start, *count, *shape, *stride, *ones; #ifdef HAVE_MPI_LARGE_COUNT + MPI_Count prev_end_off; + MPI_Count *offsets; + MPI_Count *lengths; #else + MPI_Offset prev_end_off; + MPI_Offset *offsets; + int *lengths; #endif *num_pairs = 0; /* total number of offset-length pairs */ @@ -636,57 +867,60 @@ flatten_reqs(NC *ncp, * contiguous memory space for storing off-len pairs */ for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; - max_ndims = MAX(max_ndims, ndims); - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } + /* reqs[i].npairs is the number of offset-length pairs of this request, + * calculated in ncmpio_igetput_varm() and igetput_varn() + */ + *num_pairs += reqs[i].npairs; + if (fIsSet(reqMode, NC_REQ_WR)) + ndims = ncp->put_lead_list[reqs[i].lead_off].varp->ndims; else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if (ndims == 0) { /* 1D record variable */ - (*num_pairs)++; - continue; - } - num = 1; - if (stride != NULL && stride[ndims-1] > 1) - num = count[ndims-1]; /* count of last dimension */ - for (j=0; j<ndims-1; j++) - num *= count[j]; - *num_pairs += num; + ndims = ncp->get_lead_list[reqs[i].lead_off].varp->ndims; + max_ndims = MAX(max_ndims, ndims); } /* now we can allocate a contiguous memory space for the off-len pairs */ #ifdef HAVE_MPI_LARGE_COUNT - *offsets =
(MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); - *lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs)); + offsets = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); + lengths = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * (*num_pairs+1)); #else - *offsets = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * (*num_pairs)); - *lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs)); + offsets = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * (*num_pairs+1)); + lengths = (int*) NCI_Malloc(sizeof(int) * (*num_pairs+1)); #endif - idx = 0; + *off_ptr = offsets; + *len_ptr = lengths; ones = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * max_ndims); for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; + + if (reqs[i].npairs == 1) { + /* When reqs[i] contains only one offset-length pair, re-use + * reqs[i].offset_start, which has been generated earlier at a call + * to ncmpio_intra_node_aggregation_nreqs(). + */ + offsets[idx] = reqs[i].offset_start; + lengths[idx] = reqs[i].nelems * lead->varp->xsz; + + /* check if offsets[] are in an increasing order */ + if (prev_end_off > offsets[idx]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx]; + idx++; + continue; + } ndims = lead->varp->ndims; if (ndims > 0) { @@ -715,20 +949,37 @@ if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - /* flatten each request into a list of offset-length pairs and - * append to the end of offsets and lengths + /* flatten each request into a list of offset-length pairs and append + * to the end of offsets and lengths */ flatten_subarray(ndims, lead->varp->xsz, var_begin, shape, start, count, (stride == NULL) ? ones : stride, - &num, /* OUT: number of off-len pairs */ - *offsets + idx, /* OUT: array of offsets */ - *lengths + idx); /* OUT: array of lengths */ + &num, /* OUT: number of off-len pairs */ + offsets + idx, /* OUT: array of offsets */ + lengths + idx); /* OUT: array of lengths */ + + /* check if offsets[] are in an increasing order */ + for (j=0; j<num; j++) { + if (prev_end_off > offsets[idx+j]) + *is_incr = 0; /* offsets are not incrementing */ + else + prev_end_off = offsets[idx+j]; + } idx += num; } NCI_Free(ones); + /* num_pairs may be less than originally calculated, because offset-length + * pairs are coalesced in the call to flatten_subarray(). + */ + *num_pairs = idx; + for (i=0; i<num_reqs; i++) { - NC_lead_req *lead = ncp->put_lead_list + reqs[i].lead_off; + NC_lead_req *lead; + if (fIsSet(reqMode, NC_REQ_WR)) + lead = ncp->put_lead_list + reqs[i].lead_off; + else + lead = ncp->get_lead_list + reqs[i].lead_off; if (fIsSet(lead->flag, NC_REQ_TO_FREE)) { NCI_Free(lead->start); lead->start = NULL; @@ -738,187 +989,434 @@ return status; } -/*----< construct_buf_type() >-----------------------------------------------*/ -/* construct an MPI derived datatype for I/O buffers from the request list, by - * concatenate all buffers. +/*----< flat_buf_type() >----------------------------------------------------*/ +/* Scan the nonblocking requests, pointed by reqs, and build the offset-length + * pairs of all buffers, xbuf. Note xbuf in each nonblocking request is a + * contiguous buffer (packed from the user buffer for the write operations). + * For record variables, if a user request is accessing more than one record, + * the request is split into multiple NC_req objects, one for each record.
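The buffer view assembled by this routine is ultimately a list of byte-granularity (displacement, length) pairs. A minimal, PnetCDF-independent sketch of committing such a list as an MPI derived datatype (the helper name is illustrative):

#include <mpi.h>

/* build and commit a byte-based hindexed type from nb pairs */
static int build_buf_type(int nb, const MPI_Aint *disp, const int *blen,
                          MPI_Datatype *type)
{
    int err = MPI_Type_create_hindexed(nb, blen, disp, MPI_BYTE, type);
    if (err == MPI_SUCCESS) err = MPI_Type_commit(type);
    return err;
}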
*/ static int -construct_buf_type(const NC *ncp, - int num_reqs, /* IN: # requests */ - const NC_req *reqs, /* [num_reqs] requests */ - MPI_Aint *bufLen, /* OUT: buffer size in bytes */ - MPI_Datatype *bufType) /* OUT: buffer datatype */ +flat_buf_type(const NC *ncp, + int reqMode, /* IN: NC_REQ_RD or NC_REQ_WR */ + int num_reqs, /* IN: # requests */ + const NC_req *reqs, /* IN: [num_reqs] requests */ + PNCIO_View *buf_view, /* OUT: flattened buftype */ + void **buf) /* OUT: pointer to I/O buffer */ +/* TODO: */ +#if 1 { - int i, err, mpireturn, status=NC_NOERR; + int i, j, err=NC_NOERR; NC_lead_req *lead; + MPI_Aint addr, addr0; +/* buffer offset should be of type MPI_Aint. length should be size_t. */ + + buf_view->type = MPI_BYTE; + buf_view->size = 0; + buf_view->count = 0; + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->is_contig = 1; + if (num_reqs == 0) + return NC_NOERR; + + buf_view->off = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); - MPI_Count *disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + buf_view->len = (MPI_Offset*)NCI_Malloc(sizeof(MPI_Offset) * num_reqs); #else - int *blocklens = (int*) NCI_Malloc(sizeof(int) * num_reqs); - MPI_Aint *disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + buf_view->len = (int*) NCI_Malloc(sizeof(int) * num_reqs); +#endif + +#if 1 + *buf = reqs[0].xbuf; + + lead = (fIsSet(reqMode, NC_REQ_WR)) ? ncp->put_lead_list + : ncp->get_lead_list; + + MPI_Get_address(lead[reqs[0].lead_off].xbuf, &addr0); +// printf("%s at %d: lead xbuf=%ld nelems=%lld\n",__func__,__LINE__, addr0,lead[reqs[0].lead_off].nelems); + +// assert(reqs[0].xbuf == lead[reqs[0].lead_off].xbuf); + + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr0); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = 0; + + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; +#if 0 printf("%s at %d: buf_view->len[0]=%lld nelems=%lld\n",__func__,__LINE__, buf_view->len[0],reqs[0].nelems); j=0; printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr0,j,buf_view->off[j],reqs[0].nelems); #endif - *bufLen = 0; - for (i=0; i<num_reqs; i++) { +#if 0 +wkl = (int*) malloc(buf_view->len[j]); nelems=buf_view->len[j]/4; xbuf = (char*)reqs[j].xbuf + buf_view->off[j]; memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); for (i=0; i<nelems; i++) printf("%d ", wkl[i]); printf("\n"); +#endif + buf_view->size = buf_view->len[0]; + for (i=0, j=1; j<num_reqs; j++) { + MPI_Get_address(reqs[j].xbuf, &addr); + buf_view->off[j] = addr - addr0; - /* blocklens[] in bytes */ - lead = ncp->put_lead_list + reqs[i].lead_off; - blocklens[i] = reqs[i].nelems * lead->varp->xsz; +// printf("%s at %d: buf_view xbuf=%ld off[%d]=%lld nelems=%lld\n",__func__,__LINE__, addr,j,buf_view->off[j],reqs[j].nelems); - *bufLen += blocklens[i]; - } +// assert(reqs[j].xbuf == lead[reqs[j].lead_off].xbuf); + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; /* wkl = (int*) malloc(buf_view->len[j]); nelems=buf_view->len[j]/4; xbuf = (char*)reqs[j].xbuf; // + buf_view->off[j]; xbuf = (char*)(*buf) + buf_view->off[j]; memcpy(wkl, xbuf, nelems*4); ncmpii_in_swapn(wkl, nelems, 4); printf("%s at %d: nelems=%d off=%lld buf=(%p) ",__func__,__LINE__, nelems, buf_view->off[j], xbuf); for (i=0; i<nelems; i++) printf("%d ", wkl[i]); printf("\n"); */ + + /* accumulate buffer type size */ + buf_view->size += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } + } + /* After coalescing, the true number of requests may be reduced */ +// printf("%s at %d: buf_view->size=%lld\n",__func__,__LINE__, buf_view->size); #else - /* construct buffer derived datatype */ + /* set buf_view->off[0] and buf_view->len[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + buf_view->off[0] = addr; + + lead = (fIsSet(reqMode, NC_REQ_WR)) ? ncp->put_lead_list + : ncp->get_lead_list; + + /* buf_view->len[] are in bytes */ + buf_view->len[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + ? *buf = lead[reqs[0].lead_off].xbuf; + + buf_view->size = buf_view->len[0]; + for (i=0, j=1; j<num_reqs; j++) { + MPI_Get_address(reqs[j].xbuf, &addr); + buf_view->off[j] = addr; - *bufType = MPI_DATATYPE_NULL; + /* buf_view->len[] are in bytes */ + buf_view->len[j] = reqs[j].nelems * lead[reqs[j].lead_off].varp->xsz; + + /* accumulate buffer type size */ + buf_view->size += buf_view->len[j]; + + /* coalesce the off-len pairs */ + if (buf_view->off[i] + buf_view->len[i] == buf_view->off[j]) + buf_view->len[i] += buf_view->len[j]; + else { + i++; + if (i < j) { + buf_view->off[i] = buf_view->off[j]; + buf_view->len[i] = buf_view->len[j]; + } + } } + /* After coalescing, the true number of requests may be reduced */ +#endif + + if (i + 1 < num_reqs) { + num_reqs = i + 1; /* num_reqs is reduced */ + buf_view->off = (MPI_Offset*)NCI_Realloc(buf_view->off, + sizeof(MPI_Offset) * num_reqs); #ifdef HAVE_MPI_LARGE_COUNT + buf_view->len = (MPI_Offset*)NCI_Realloc(buf_view->len, + sizeof(MPI_Offset) * num_reqs); #else + buf_view->len = (int*) NCI_Realloc(buf_view->len, + sizeof(int) * num_reqs); #endif } + buf_view->count = num_reqs; + buf_view->is_contig = (num_reqs <= 1); + /* construct buf_view->type if it is noncontiguous */ + if (num_reqs > 1) { + int mpireturn; #ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, buf_view->len, + buf_view->off, MPI_BYTE, + &buf_view->type); #else + MPI_Aint *disp; #if SIZEOF_MPI_AINT == SIZEOF_MPI_OFFSET + disp = (MPI_Aint*) buf_view->off; #else + disp = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + for (j=0; j<num_reqs; j++) + disp[j] = (MPI_Aint) buf_view->off[j]; #endif + mpireturn = MPI_Type_create_hindexed(num_reqs, buf_view->len, disp, + MPI_BYTE, &buf_view->type); #if SIZEOF_MPI_AINT != SIZEOF_MPI_OFFSET + NCI_Free(disp); #endif #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn,
"MPI_Type_create_hindexed"); + + buf_view->type = MPI_BYTE; + NCI_Free(buf_view->off); + NCI_Free(buf_view->len); + buf_view->off = NULL; + buf_view->len = NULL; + buf_view->count = 0; + buf_view->size = 0; + } + else { + MPI_Type_commit(&buf_view->type); + } + } + + return err; +} +#else { - int i, j, err, mpireturn, status=NC_NOERR, nreqs; - char *recv_buf=NULL, *wr_buf = NULL; - MPI_Aint npairs=0, *msg; - MPI_Offset offset=0, buf_count; - MPI_Datatype recvTypes, fileType=MPI_BYTE; - MPI_File fh; - MPI_Request *req=NULL; -#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) - double timing = MPI_Wtime(); + int i, j, err, mpireturn, status=NC_NOERR; + NC_lead_req *lead; + MPI_Aint addr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *disps, *blens; +#else + MPI_Aint *disps; + int *blens; #endif + + if (num_reqs == 0) { + buf_view->type = MPI_BYTE; + buf_view->count = 0; + return NC_NOERR; + } + #ifdef HAVE_MPI_LARGE_COUNT - MPI_Count bufLen; - MPI_Type_size_c(bufType, &bufLen); + disps = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Malloc(sizeof(MPI_Count) * num_reqs); #else - int bufLen; - MPI_Type_size(bufType, &bufLen); + disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Malloc(sizeof(int) * num_reqs); #endif - bufLen *= bufCount; - /* First, tell aggregator how much to receive by sending: - * (num_pairs and bufLen). The message size to be sent by this rank - * is num_pairs * 2 * sizeof(MPI_Offset) + bufLen - */ - if (ncp->rank == ncp->my_aggr) - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 2); - else - msg = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 2); + /* set disps[0] and blens[0] */ + MPI_Get_address(reqs[0].xbuf, &addr); /* displacement uses MPI_BOTTOM */ + disps[0] = addr; - msg[0] = num_pairs; - msg[1] = bufLen; + lead = (fIsSet(reqMode, NC_REQ_WR)) ? 
ncp->put_lead_list + : ncp->get_lead_list; - /* Aggregator collects each non-aggregator's num_pairs and bufLen */ - if (ncp->rank == ncp->my_aggr) { - req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs); - nreqs = 0; - for (i=1; inum_nonaggrs; i++) - MPI_Irecv(msg + i*2, 2, MPI_AINT, ncp->nonaggr_ranks[i], 0, - ncp->comm, &req[nreqs++]); + /* blens[] are in bytes */ + blens[0] = reqs[0].nelems * lead[reqs[0].lead_off].varp->xsz; + *buf = lead[reqs[0].lead_off].xbuf; - mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE); - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; + for (i=0, j=1; jxsz; + + /* coalesce the disps-blens pairs */ + if (disps[i] + blens[i] == disps[j]) + blens[i] += blens[j]; + else { + i++; + if (i < j) { + disps[i] = disps[j]; + blens[i] = blens[j]; + } } } - else { /* non-aggregator */ - MPI_Send(msg, 2, MPI_AINT, ncp->my_aggr, 0, ncp->comm); - if (num_pairs == 0) - NCI_Free(msg); + + if (i + 1 < num_reqs) { + num_reqs = i + 1; +#ifdef HAVE_MPI_LARGE_COUNT + disps = (MPI_Count*)NCI_Realloc(disps, sizeof(MPI_Count) * num_reqs); + blens = (MPI_Count*)NCI_Realloc(blens, sizeof(MPI_Count) * num_reqs); +#else + disps = (MPI_Aint*) NCI_Realloc(disps, sizeof(MPI_Aint) * num_reqs); + blens = (int*) NCI_Realloc(blens, sizeof(int) * num_reqs); +#endif } - /* Aggregator collects offset-length pairs from non-aggregators */ - if (ncp->rank == ncp->my_aggr) { - /* calculate the total number of offset-length pairs */ - npairs = num_pairs; - for (i=1; inum_nonaggrs; i++) npairs += msg[i*2]; + buf_view->count = num_reqs; + buf_view->off = disps; + buf_view->len = blens; +/* TODO: below datatype construction moves into ncmpio_read_write() */ + if (num_reqs == 1) { +#if 1 +buf_view->count = blens[0]; +#endif + buf_view->type = MPI_BYTE; + } + else { +#if 1 + /* construct buffer derived datatype */ #ifdef HAVE_MPI_LARGE_COUNT - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Count*) NCI_Realloc(offsets, sizeof(MPI_Count) * npairs); - lengths = (MPI_Count*) NCI_Realloc(lengths, sizeof(MPI_Count) * npairs); - } + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); #else - if (npairs > num_pairs) { - /* realloc to store all pairs in a contiguous buffer */ - offsets = (MPI_Aint*) NCI_Realloc(offsets, sizeof(MPI_Aint) * npairs); - lengths = (int*) NCI_Realloc(lengths, sizeof(int) * npairs); - } + mpireturn = MPI_Type_create_hindexed(num_reqs, blens, disps, + MPI_BYTE, &buf_view->type); #endif + if (mpireturn != MPI_SUCCESS) { + err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ + if (status == NC_NOERR) status = err; - nreqs = 0; -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Aint aint; - MPI_Count bklens[2]; - MPI_Count disps[2]; + buf_view->type = MPI_BYTE; + buf_view->count = 0; + } + else { + MPI_Type_commit(&buf_view->type); +buf_view->count = 1; + } +#endif + *buf = NULL; /* buf_view->type is constructed using MPI_BOTTOM */ + } - MPI_Get_address(offsets, &aint); - disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); - MPI_Get_address(lengths, &aint); - disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * msg[0]); +#if 1 + NCI_Free(blens); + NCI_Free(disps); +#endif + return status; +} +#endif + +/*----< ina_collect_md() 
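Before any bulk data moves, each aggregation group first exchanges a fixed-size metadata triple per member. A minimal sketch of that first phase, under assumed names (root, nmem, ranks[] are hypothetical stand-ins for the group layout used by this patch):

    #include <stdlib.h>
    #include <string.h>
    #include <mpi.h>

    /* Root gathers one {num_pairs, nbytes, is_incr} triple per member. */
    static void gather_triples(MPI_Comm comm, int root, int nmem,
                               const int *ranks, const MPI_Aint my3[3],
                               MPI_Aint *all /* 3*nmem entries, root only */)
    {
        int i, rank;
        MPI_Comm_rank(comm, &rank);
        if (rank == root) {
            MPI_Request *req = malloc(sizeof(MPI_Request) * (nmem - 1));
            memcpy(all, my3, 3 * sizeof(MPI_Aint));   /* self triple first */
            for (i = 1; i < nmem; i++)
                MPI_Irecv(all + i*3, 3, MPI_AINT, ranks[i], 0, comm, &req[i-1]);
            MPI_Waitall(nmem - 1, req, MPI_STATUSES_IGNORE);
            free(req);
        } else
            MPI_Send((void*)my3, 3, MPI_AINT, root, 0, comm);
    }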
+/*----< ina_collect_md() >---------------------------------------------------*/
+/* Within each intra-node aggregation group, the aggregator collects request
+ * metadata from the non-aggregators into meta, including:
+ * 1. the number of offset-length pairs on each non-aggregator
+ * 2. offsets array of each non-aggregator
+ * 3. lengths array of each non-aggregator
+ * 4. npairs is the total number of offset-length pairs of this group.
+ */
+static
+int ina_collect_md(NC       *ncp,
+                   MPI_Aint *meta,
+#ifdef HAVE_MPI_LARGE_COUNT
+                   MPI_Count  **offsets, /* OUT: may be realloc-ed */
+                   MPI_Count  **lengths, /* OUT: may be realloc-ed */
+#else
+                   MPI_Offset **offsets, /* OUT: may be realloc-ed */
+                   int        **lengths, /* OUT: may be realloc-ed */
+#endif
+                   MPI_Aint *npairs)     /* OUT: total no. off-len pairs */
+{
+    int i, err, mpireturn, status=NC_NOERR, nreqs;
+    MPI_Request *req=NULL;
+    MPI_Aint num_pairs=meta[0];
+
+    /* Aggregator collects each non-aggregator's num_pairs and bufLen */
+    if (ncp->my_aggr == ncp->rank) {
+
+        req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+        nreqs = 0;
+        for (i=1; i<ncp->num_nonaggrs; i++)
+            TRACE_COMM(MPI_Irecv)(meta + i*3, 3, MPI_AINT,
+                      ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
+            }
+        }
+    }
+    else /* non-aggregator */
+        TRACE_COMM(MPI_Send)(meta, 3, MPI_AINT, ncp->my_aggr, 0, ncp->comm);
+
+    /* Secondly, each aggregator collects the offset-length pairs from all its
+     * non-aggregators
+     */
+    if (ncp->my_aggr == ncp->rank) {
+        MPI_Datatype recvType;
+
+        /* calculate the total number of offset-length pairs to receive */
+        for (*npairs=0, i=0; i<ncp->num_nonaggrs; i++) *npairs += meta[i*3];
+
+        /* offsets and lengths have been allocated for storing this rank's
+         * offsets and lengths, realloc them to receive offsets and lengths
+         * from non-aggregators so they can be in a contiguous buffer.
+         */
+#ifdef HAVE_MPI_LARGE_COUNT
+        if (*npairs > num_pairs) {
+            *offsets = (MPI_Count*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Count));
+            *lengths = (MPI_Count*) NCI_Realloc(*lengths, *npairs * sizeof(MPI_Count));
+        }
+#else
+        if (*npairs > num_pairs) {
+            /* realloc to store all pairs in a contiguous buffer */
+            *offsets = (MPI_Offset*) NCI_Realloc(*offsets, *npairs * sizeof(MPI_Offset));
+            *lengths = (int*)        NCI_Realloc(*lengths, *npairs * sizeof(int));
+        }
+#endif
+
+        /* To minimize the number of MPI recv calls per non-aggregator, below
+         * creates a derived datatype, recvType, to combine offsets and lengths
+         * into one MPI_Irecv call.
+         */
+        nreqs = 0;
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Aint aint;
+        MPI_Count bklens[2];
+        MPI_Count disps[2];
+
+        MPI_Get_address(*offsets, &aint);
+        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]);
+        MPI_Get_address(*lengths, &aint);
+        disps[1] = MPI_Aint_add(aint, sizeof(MPI_Count) * meta[0]);
         for (i=1; i<ncp->num_nonaggrs; i++) {
-            if (msg[i*2] == 0) continue;
-            bklens[0] = msg[i*2] * sizeof(MPI_Count);
-            bklens[1] = msg[i*2] * sizeof(MPI_Count);
+            if (meta[i*3] == 0) continue;
+            bklens[0] = meta[i*3] * sizeof(MPI_Count);
+            bklens[1] = meta[i*3] * sizeof(MPI_Count);
             mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE,
-                                                   &recvTypes);
+                                                   &recvType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
             else {
-                mpireturn = MPI_Type_commit(&recvTypes);
+                mpireturn = MPI_Type_commit(&recvType);
                 if (mpireturn != MPI_SUCCESS) {
                     err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                     /* return the first encountered error if there is any */
@@ -926,35 +1424,34 @@ intra_node_aggregation(NC *ncp,
                 }
             }
             /* post to receive offset-length pairs from non-aggregators */
-            MPI_Irecv_c(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i],
-                        0, ncp->comm, &req[nreqs]);
-            MPI_Type_free(&recvTypes);
+            TRACE_COMM(MPI_Irecv_c)(MPI_BOTTOM, 1, recvType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+            MPI_Type_free(&recvType);
             disps[0] = MPI_Aint_add(disps[0], bklens[0]);
             disps[1] = MPI_Aint_add(disps[1], bklens[1]);
-            nreqs++;
         }
 #else
         int bklens[2];
         MPI_Aint aint, disps[2];
-        MPI_Get_address(offsets, &aint);
-        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Aint) * msg[0]);
-        MPI_Get_address(lengths, &aint);
-        disps[1] = MPI_Aint_add(aint, sizeof(int) * msg[0]);
+        MPI_Get_address(*offsets, &aint);
+        disps[0] = MPI_Aint_add(aint, sizeof(MPI_Offset) * meta[0]);
+        MPI_Get_address(*lengths, &aint);
+        disps[1] = MPI_Aint_add(aint, sizeof(int) * meta[0]);
         for (i=1; i<ncp->num_nonaggrs; i++) {
-            if (msg[i*2] == 0) continue;
-            bklens[0] = msg[i*2] * sizeof(MPI_Aint);
-            bklens[1] = msg[i*2] * sizeof(int);
+            if (meta[i*3] == 0) continue;
+            bklens[0] = meta[i*3] * sizeof(MPI_Offset);
+            bklens[1] = meta[i*3] * sizeof(int);
             mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE,
-                                                 &recvTypes);
+                                                 &recvType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
             else {
-                mpireturn = MPI_Type_commit(&recvTypes);
+                mpireturn = MPI_Type_commit(&recvType);
                 if (mpireturn != MPI_SUCCESS) {
                     err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                     /* return the first encountered error if there is any */
@@ -962,368 +1459,1369 @@ intra_node_aggregation(NC *ncp,
                 }
             }
             /* post to receive offset-length pairs from non-aggregators */
-            MPI_Irecv(MPI_BOTTOM, 1, recvTypes, ncp->nonaggr_ranks[i],
-                      0, ncp->comm, &req[nreqs]);
-            MPI_Type_free(&recvTypes);
+            TRACE_COMM(MPI_Irecv)(MPI_BOTTOM, 1, recvType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+            MPI_Type_free(&recvType);
             disps[0] = MPI_Aint_add(disps[0], bklens[0]);
             disps[1] = MPI_Aint_add(disps[1], bklens[1]);
-            nreqs++;
         }
 #endif
-        mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE);
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
+            }
+        }
+        NCI_Free(req);
     }
     else if (num_pairs > 0) { /* non-aggregator */
-        /* send offset-length pairs data to the aggregator */
+        /* To minimize the number of MPI send calls to the aggregator, below
+         * creates a derived datatype, sendType, to combine offsets and lengths
+         * into one MPI_Send call.
+         */
+        MPI_Datatype sendType;
+
 #ifdef HAVE_MPI_LARGE_COUNT
         MPI_Aint aint;
         MPI_Count bklens[2];
         MPI_Count disps[2];
-        bklens[0] = msg[0] * sizeof(MPI_Count);
+        bklens[0] = meta[0] * sizeof(MPI_Count);
         bklens[1] = bklens[0];
-        MPI_Get_address(offsets, &aint);
+        MPI_Get_address(*offsets, &aint);
         disps[0] = aint;
-        MPI_Get_address(lengths, &aint);
+        MPI_Get_address(*lengths, &aint);
         disps[1] = aint;
         mpireturn = MPI_Type_create_hindexed_c(2, bklens, disps, MPI_BYTE,
-                                               &recvTypes);
+                                               &sendType);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed_c");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
         }
         else {
-            mpireturn = MPI_Type_commit(&recvTypes);
+            mpireturn = MPI_Type_commit(&sendType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
         }
-        MPI_Send_c(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm);
-        MPI_Type_free(&recvTypes);
+        TRACE_COMM(MPI_Send_c)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0,
+                               ncp->comm);
+        MPI_Type_free(&sendType);
 #else
         int bklens[2];
         MPI_Aint disps[2];
-        bklens[0] = msg[0] * sizeof(MPI_Aint);
-        bklens[1] = msg[0] * sizeof(int);
-        MPI_Get_address(offsets, &disps[0]);
-        MPI_Get_address(lengths, &disps[1]);
+        bklens[0] = meta[0] * sizeof(MPI_Offset);
+        bklens[1] = meta[0] * sizeof(int);
+        MPI_Get_address(*offsets, &disps[0]);
+        MPI_Get_address(*lengths, &disps[1]);
         mpireturn = MPI_Type_create_hindexed(2, bklens, disps, MPI_BYTE,
-                                             &recvTypes);
+                                             &sendType);
         if (mpireturn != MPI_SUCCESS) {
             err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
             /* return the first encountered error if there is any */
             if (status == NC_NOERR) status = err;
         }
         else {
-            mpireturn = MPI_Type_commit(&recvTypes);
+            mpireturn = MPI_Type_commit(&sendType);
             if (mpireturn != MPI_SUCCESS) {
                 err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
                 /* return the first encountered error if there is any */
                 if (status == NC_NOERR) status = err;
             }
         }
-        MPI_Send(MPI_BOTTOM, 1, recvTypes, ncp->my_aggr, 0, ncp->comm);
-        MPI_Type_free(&recvTypes);
+        TRACE_COMM(MPI_Send)(MPI_BOTTOM, 1, sendType, ncp->my_aggr, 0,
+                             ncp->comm);
+        MPI_Type_free(&sendType);
 #endif
-        NCI_Free(msg);
     }
+
+    return status;
+}
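Both sides of this exchange describe two separate arrays with a single two-block hindexed datatype built on absolute addresses, so the offsets and lengths travel in one message. A minimal sender-side sketch of the pattern (simplified to int payloads; an illustration, not the patch's exact code):

    #include <mpi.h>

    /* Send arrays a[n] and b[n] in one message using absolute addresses. */
    static void send_two_arrays(const int *a, const int *b, int n,
                                int dest, MPI_Comm comm)
    {
        int blens[2] = { n * (int)sizeof(int), n * (int)sizeof(int) };
        MPI_Aint disps[2];
        MPI_Datatype two;

        MPI_Get_address(a, &disps[0]);   /* absolute displacements ...   */
        MPI_Get_address(b, &disps[1]);
        MPI_Type_create_hindexed(2, blens, disps, MPI_BYTE, &two);
        MPI_Type_commit(&two);
        MPI_Send(MPI_BOTTOM, 1, two, dest, 0, comm); /* ... so use MPI_BOTTOM */
        MPI_Type_free(&two);
    }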
-    /*
-     * TODO, define a datatype to combine sends of offset-length pairs with the
-     * write data into a single send call.
-     */
-    nreqs = 0;
-    if (ncp->rank == ncp->my_aggr) {
-        /* calculate the total write amount */
-        buf_count = bufLen;
-        for (i=1; i<ncp->num_nonaggrs; i++) buf_count += msg[i*2 + 1];
-
-        /* Allocate receive buffer, which will be sorted into an increasing
-         * order based on the file offsets. Thus, after sorting pack recv_buf
-         * to wr_buf to avoid creating another buffer datatype.
-         */
-        if (buf_count > 0) {
-            recv_buf = (char*) NCI_Malloc(buf_count);
-            wr_buf = (char*) NCI_Malloc(buf_count);
-        }
-        /* First, pack self write data into front of the recv_buf */
-        if (bufLen > 0) {
-            if (bufType == MPI_BYTE)
-                memcpy(recv_buf, buf, bufLen);
-            else {
-                void *inbuf = (buf == NULL) ? MPI_BOTTOM : buf;
+/*----< ina_put() >----------------------------------------------------------*/
+/* This subroutine implements the intra-node aggregation for write operations.
+ */
+static
+int ina_put(NC         *ncp,
+            int         is_incr,   /* if offsets are incremental */
+            MPI_Aint    num_pairs, /* number of offset-length pairs */
 #ifdef HAVE_MPI_LARGE_COUNT
-                MPI_Count position=0;
-                MPI_Count incount = (buf == NULL) ? 1 : bufCount;
-                MPI_Pack_c(inbuf, incount, bufType, recv_buf, bufLen, &position,
-                           MPI_COMM_SELF);
+            MPI_Count  *offsets,
+            MPI_Count  *lengths,
 #else
-                int position=0;
-                int incount = (buf == NULL) ? 1 : bufCount;
-                MPI_Pack(inbuf, incount, bufType, recv_buf, bufLen, &position,
-                         MPI_COMM_SELF);
+            MPI_Offset *offsets,
+            int        *lengths,
 #endif
-            }
+            PNCIO_View  buf_view,
+            void       *buf)       /* user buffer */
+{
+    int i, j, err, mpireturn, status=NC_NOERR;
+    char *recv_buf=NULL, *wr_buf = NULL;
+    MPI_Aint npairs=0, *meta=NULL, *count=NULL;
+    MPI_Offset wr_amnt=0;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Count *off_ptr, *len_ptr;
+#else
+    MPI_Offset *off_ptr;
+    int *len_ptr;
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double endT, startT = MPI_Wtime();
+    MPI_Offset mem_max;
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[0] = MAX(ncp->maxmem_put[0], mem_max);
+#endif
+
+    /* buf may be noncontiguous ! */
+
+    /* Firstly, aggregators collect metadata from non-aggregators.
+     *
+     * This rank tells its aggregator how much metadata to receive from this
+     * rank, by sending: the number of offset-length pairs (num_pairs) and the
+     * user buffer size in bytes (buf_view.size). The message sent by this
+     * rank is three MPI_Aint values.
+     */
+    if (ncp->rank == ncp->my_aggr)
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3);
+    else
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3);
+
+    meta[0] = num_pairs;
+    meta[1] = buf_view.size;
+    meta[2] = is_incr;
+
+    /* Each aggregator first collects metadata about its offset-length pairs,
+     * the size of the write request, and whether the offsets are in an
+     * incremental order. The aggregator will gather these metadata from the
+     * non-aggregators assigned to it.
+     * For write operations, keeping the original offset-length pairs is not
+     * necessary, as they will later be sorted and coalesced before calling
+     * MPI-IO or PNCIO file write.
+     *
+     * Once ina_collect_md() returns, this aggregator's offsets and lengths may
+     * grow to include the ones from non-aggregators (appended).
+     */
+    if (ncp->num_nonaggrs > 1) {
+        err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs);
+        if (err != NC_NOERR) {
+            NCI_Free(meta);
+            return err;
         }
+    }
+    else
+        npairs = num_pairs;
+
+    /* For write operations, the non-aggregators now can start sending their
+     * write data to the aggregator.
+     */
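The is_incr flag carried in meta[2] is what lets an aggregator skip the sort entirely. It can be computed cheaply while flattening; a minimal sketch of the predicate (illustration only):

    #include <stddef.h>

    /* returns 1 if off[0..n-1] is monotonically non-decreasing */
    static int is_nondecreasing(const long long *off, size_t n)
    {
        size_t i;
        for (i = 1; i < n; i++)
            if (off[i] < off[i-1]) return 0;
        return 1;
    }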
+    if (ncp->rank != ncp->my_aggr) { /* non-aggregator */
+        if (meta[0] > 0) {
+            /* Non-aggregators send write data to the aggregator */
 #ifdef HAVE_MPI_LARGE_COUNT
-            MPI_Irecv_c(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i],
-                        0, ncp->comm, &req[nreqs++]);
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Send_c)(buf, num, buf_view.type, ncp->my_aggr,
+                                   0, ncp->comm);
 #else
-            MPI_Irecv(ptr, msg[i*2 + 1], MPI_BYTE, ncp->nonaggr_ranks[i],
-                      0, ncp->comm, &req[nreqs++]);
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Send)(buf, num, buf_view.type, ncp->my_aggr,
+                                 0, ncp->comm);
 #endif
-            ptr += msg[i*2 + 1];
-        }
-        mpireturn = MPI_Waitall(nreqs, req, MPI_STATUSES_IGNORE);
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
-        }
         }
-        NCI_Free(req);
-        NCI_Free(msg);
+
+        /* Must free offsets and lengths now, as they may be realloc-ed in
+         * ina_collect_md()
+         */
+        if (offsets != NULL) NCI_Free(offsets);
+        if (lengths != NULL) NCI_Free(lengths);
+
+        /* Non-aggregators are done here, as only aggregators call MPI-IO/PNCIO
+         * functions to write data to the file. Non-aggregators do not
+         * participate in MPI-IO calls.
+         */
+        NCI_Free(meta);
+        return status;
     }
-    else if (bufLen > 0) {
-        /* send write data to the aggregator */
-        void *buf_ptr = (buf == NULL) ? MPI_BOTTOM : buf;
+
+    /* The remainder of this subroutine is for aggregators only */
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[1] = MAX(ncp->maxmem_put[1], mem_max);
+    endT = MPI_Wtime();
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[0] += endT - startT;
+    startT = endT;
+#endif
+
+    off_ptr = offsets;
+    len_ptr = lengths;
+
+    /* MPI-IO has the following requirements about filetype.
+     * 1. The (flattened) displacements (of a filetype) are not required to be
+     *    distinct, but they cannot be negative, and they must be monotonically
+     *    non-decreasing.
+     * 2. If the file is opened for writing, neither the etype nor the filetype
+     *    is permitted to contain overlapping regions.
+     */
+    if (npairs > 0) {
+        /* Now this aggregator has received all offset-length pairs from its
+         * non-aggregators. At first, check if a sorting is necessary.
+         */
+        char *ptr;
+        int nreqs, indv_sorted, do_sort, overlap;
+        MPI_Request *req=NULL;
+        MPI_Offset recv_amnt;
+
+        /* check if offsets of all non-aggregators are individually sorted */
+        indv_sorted = 1;
+        do_sort = 0;
+        for (i=-1,j=0; j<ncp->num_nonaggrs; j++) {
+            if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */
+                i = j;
+            if (meta[j*3+2] == 0) { /* j's offsets are not sorted */
+                indv_sorted = 0;
+                do_sort = 1;
+                break;
+            }
+        }
+        /* i is the first non-aggregator whose num_pairs > 0, and
+         * j is the first non-aggregator whose is_incr is false
+         */
+// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted);
+
+        if (i >= 0 && indv_sorted == 1) {
+            /* When all ranks' offsets are individually sorted, we still need
+             * to check if offsets are interleaved among all non-aggregators to
+             * determine whether a sort for all offset-length pairs is
+             * necessary.
+             */
 #ifdef HAVE_MPI_LARGE_COUNT
-        MPI_Count num = (buf == NULL) ? 1 : bufCount;
-        MPI_Send_c(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm);
+            MPI_Count prev_end_off;
 #else
-        int num = (buf == NULL) ? 1 : bufCount;
-        MPI_Send(buf_ptr, num, bufType, ncp->my_aggr, 0, ncp->comm);
+            MPI_Offset prev_end_off;
 #endif
-        NCI_Free(offsets);
-        NCI_Free(lengths);
-    }
+            assert(meta[i*3+2] == 1);
+
+            MPI_Aint sum = meta[i*3];
+            prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */
-    /* aggregator sorts the offset-length pairs, along with the buffer */
-    if (ncp->rank == ncp->my_aggr && npairs > 0) {
+            /* check if the offsets are interleaved */
+            for (++i; i<ncp->num_nonaggrs; i++) {
+                if (meta[i*3] == 0) /* zero-sized request */
+                    continue;
+                assert(meta[i*3+2] == 1);
-        /* construct array of buffer addresses */
+                if (prev_end_off > off_ptr[sum]) {
+                    /* off_ptr[sum] is non-aggregator i's 1st offset */
+                    do_sort = 1; /* offsets are not incrementing */
+                    break;
+                }
+                /* move on to the next non-aggregator */
+                sum += meta[i*3];
+                prev_end_off = off_ptr[sum-1];
+            }
+        }
+
+        if (do_sort && indv_sorted) {
+            /* Interleaved offsets are found but individual offsets are already
+             * sorted. In this case, heap_merge() is called to merge all
+             * offsets into one single sorted offset list. Note count[] is
+             * initialized and will be used in heap_merge()
+             */
+            count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) *ncp->num_nonaggrs);
+            for (i=0; i<ncp->num_nonaggrs; i++) count[i] = meta[i*3];
+        }
+
+        /* Construct an array of buffer addresses containing a mapping between
+         * the buffer used to receive write data from non-aggregators and the
+         * buffer used to write to the file. bufAddr[] is calculated based on
+         * the assumption that the write buffer is contiguous.
+         */
         MPI_Aint *bufAddr = (MPI_Aint*)NCI_Malloc(sizeof(MPI_Aint) * npairs);
         bufAddr[0] = 0;
         for (i=1; i<npairs; i++)
             bufAddr[i] = bufAddr[i-1] + len_ptr[i-1];
+
+        if (do_sort) {
+            /* Sort offsets and lengths, based on offsets, into an increasing
+             * order.
+             */
+            if (indv_sorted) {
+                heap_merge(ncp->num_nonaggrs, count, npairs, off_ptr, len_ptr,
+                           bufAddr);
+                NCI_Free(count);
+            }
+            else
+                /* When some individual offsets are not sorted, we cannot use
+                 * heap_merge(). Note qsort() is an in-place sorting.
+                 */
+                qsort_off_len_buf(npairs, off_ptr, len_ptr, bufAddr);
+        }
+// printf("%s at %d: do_sort=%d indv_sorted=%d\n",__func__,__LINE__, do_sort,indv_sorted);
-        /* merge the overlapped buffer segments, skip the overlapped regions
-         * for those with higher j indices (i.e. requests with lower j indices
-         * win the writes to the overlapped regions)
+        /* Now off_ptr and len_ptr are sorted, but overlaps may exist between
+         * adjacent pairs. If this is the case, they must be coalesced.
+         *
+         * The loop below checks if there is overlap and calculates recv_amnt
+         * and wr_amnt.
+         * recv_amnt is the total amount this aggregator will receive from
+         * non-aggregators, including self. recv_amnt includes overlaps.
+         * wr_amnt is recv_amnt with overlap removed.
+         *
+         * This loop also coalesces offset-length pairs as well as the
+         * corresponding buffer addresses, so they can be used to move write
+         * data around in the true write buffer.
          */
+        overlap = 0;
+int fake_overlap=0;
+        wr_amnt = recv_amnt = len_ptr[0];
         for (i=0, j=1; j<npairs; j++) {
-            if (offsets[i] + lengths[i] >= offsets[j] + lengths[j])
+            recv_amnt += len_ptr[j];
+            if (off_ptr[i] + len_ptr[i] >= off_ptr[j] + len_ptr[j]) {
+                overlap = 1;
+fake_overlap=1;
                 /* segment i completely covers segment j, skip j */
                 continue;
+            }
-            MPI_Offset gap = offsets[i] + lengths[i] - offsets[j];
-            if (gap >= 0) { /* segments i and j overlap */
-                if (bufAddr[i] + lengths[i] == bufAddr[j] + gap) {
-                    /* buffers i and j are contiguous, merge j to i */
-                    lengths[i] += lengths[j] - gap;
+            MPI_Offset gap = off_ptr[i] + len_ptr[i] - off_ptr[j];
+            if (gap >= 0) { /* overlap detected, merge j into i */
+                /* when gap > 0, pairs i and j overlap
+                 * when gap == 0, pairs i and j are contiguous
+                 */
+                if (gap > 0) overlap = 1;
+if (gap >= 0) fake_overlap=1;
+                wr_amnt += len_ptr[j] - gap;
+                if (bufAddr[i] + len_ptr[i] == bufAddr[j] + gap) {
+                    /* buffers i and j are contiguous, merge j into i */
+                    len_ptr[i] += len_ptr[j] - gap;
                 }
                 else { /* buffers are not contiguous, reduce j's len */
-                    offsets[i+1] = offsets[j] + gap;
-                    lengths[i+1] = lengths[j] - gap;
+                    off_ptr[i+1] = off_ptr[j] + gap;
+                    len_ptr[i+1] = len_ptr[j] - gap;
                     bufAddr[i+1] = bufAddr[j] + gap;
                     i++;
                 }
             }
             else { /* i and j do not overlap */
+                wr_amnt += len_ptr[j];
                 i++;
                 if (i < j) {
-                    offsets[i] = offsets[j];
-                    lengths[i] = lengths[j];
+                    off_ptr[i] = off_ptr[j];
+                    len_ptr[i] = len_ptr[j];
                     bufAddr[i] = bufAddr[j];
                 }
             }
         }
-        /* update number of pairs, now all off-len pairs are not overlapped */
+/*
+if (ncp->num_nonaggrs == 1 && do_sort == 1) printf("%s at %d: overlap=%d do_sort=%d after coalesce npairs changed from %ld to %d wr_amnt=%lld recv_amnt=%lld\n",__func__,__LINE__, overlap, do_sort,npairs,i+1,wr_amnt,recv_amnt);
+*/
+
+if (fake_overlap == 0) assert(npairs == i+1);
+
+        /* Now off_ptr[], len_ptr[], bufAddr[] are coalesced with no overlap */
         npairs = i+1;
-        /* pack recv_buf, data received from non-aggregators, into wr_buf, a
-         * contiguous buffer, wr_buf, which will later be used in a call to
-         * MPI_File_write_at_all()
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_put[2] = MAX(ncp->maxmem_put[2], mem_max);
+
+        endT = MPI_Wtime();
+        ncp->ina_time_put[1] += endT - startT;
+        ncp->ina_npairs_put = MAX(ncp->ina_npairs_put, npairs);
+        startT = endT;
+#endif
+
+        /* Allocate the receive buffer. Once write data from non-aggregators
+         * have been received into recv_buf, it is packed into wr_buf. Then,
+         * wr_buf is used to call MPI-IO/PNCIO file write. Note wr_buf is
+         * always contiguous.
+         *
+         * When ncp->num_nonaggrs == 1, wr_buf is set to buf, which is directly
+         * passed to MPI-IO/PNCIO file write.
+         *
+         * If the file offset-length pairs have not been re-ordered, i.e.
+         * sorted and overlaps removed, and this aggregator will not receive
+         * any write data from its non-aggregators, then we can use the user's
+         * buffer, buf, to call MPI-IO/PNCIO to write to the file, without
+         * allocating an additional temporary buffer.
+         */
+        if (!do_sort && buf_view.size == recv_amnt && !overlap)
+            recv_buf = buf;
+        else
+            recv_buf = (char*) NCI_Malloc(recv_amnt);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_put[3] = MAX(ncp->maxmem_put[3], mem_max);
+#endif
+
+        if (recv_buf != buf) {
+            /* Pack this aggregator's write data into the front of recv_buf */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count pos=0;
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            MPI_Pack_c(buf, num, buf_view.type, recv_buf, buf_view.size, &pos,
+                       MPI_COMM_SELF);
+#else
+            int pos=0;
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            MPI_Pack(buf, num, buf_view.type, recv_buf, buf_view.size, &pos,
+                     MPI_COMM_SELF);
+#endif
+        }
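Staging a (possibly noncontiguous) user buffer into a contiguous buffer through its derived datatype follows the standard MPI_Pack pattern. A minimal sketch, assuming btype was built with absolute displacements and stage holds nbytes (illustration only):

    /* btype describes the user buffer via absolute addresses, so the input
     * buffer argument is MPI_BOTTOM and one "element" covers everything. */
    int pos = 0;
    MPI_Pack(MPI_BOTTOM, 1, btype, stage, nbytes, &pos, MPI_COMM_SELF);
    /* on success, pos == nbytes, and stage[] holds the packed bytes */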
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        endT = MPI_Wtime();
+        ncp->ina_time_put[2] += endT - startT;
+        startT = endT;
+#endif
+
+        /* Receive write data sent from non-aggregators. Note we cannot move
+         * the posting of the MPI_Irecv calls to before sorting and leave
+         * MPI_Waitall() to after sorting, to overlap communication with the
+         * sorting, because the sorting determines the receive buffer size.
+         */
-        char *ptr = wr_buf;
-        buf_count = 0;
-        if (npairs > 0) {
-            memcpy(ptr, recv_buf + bufAddr[0], lengths[0]);
-            ptr += lengths[0];
-            buf_count = lengths[0];
+        req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+        ptr = recv_buf + buf_view.size;
+        nreqs = 0;
+        for (i=1; i<ncp->num_nonaggrs; i++) {
+            if (meta[i*3 + 1] == 0) continue;
+#ifdef HAVE_MPI_LARGE_COUNT
+            TRACE_COMM(MPI_Irecv_c)(ptr, meta[i*3 + 1], MPI_BYTE,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#else
+            TRACE_COMM(MPI_Irecv)(ptr, meta[i*3 + 1], MPI_BYTE,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#endif
+            ptr += meta[i*3 + 1];
         }
-        for (i=0, j=1; j<npairs; j++) {
+        if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+            TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+            MPI_Status *statuses = (MPI_Status *)
+                                   NCI_Malloc(nreqs * sizeof(MPI_Status));
+            TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+            NCI_Free(statuses);
+#endif
+            if (mpireturn != MPI_SUCCESS) {
+                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+                /* return the first encountered error if there is any */
+                if (status == NC_NOERR) status = err;
             }
-            else {
-                i++;
-                if (i < j) {
-                    offsets[i] = offsets[j];
-                    lengths[i] = lengths[j];
-                }
+        }
+        NCI_Free(req);
+
+        /* Now all write data has been collected into recv_buf. In case of any
+         * overlap, we must coalesce recv_buf into wr_buf using off_ptr[],
+         * len_ptr[], and bufAddr[]. For overlapped regions, requests with
+         * lower j indices win the writes to the overlapped regions.
+         *
+         * In case the user buffer, buf, cannot be used to write to the file,
+         * the loop below packs recv_buf, the data received from
+         * non-aggregators, into wr_buf, a contiguous buffer, which will later
+         * be used in a call to MPI-IO/PNCIO file write.
+         */
+        if (!do_sort && wr_amnt == recv_amnt)
+            wr_buf = recv_buf;
+        else {
+            /* do_sort means the buffer's offsets and lengths have been moved
+             * around in order to make the file offset-length pairs
+             * monotonically non-decreasing. We need to copy the write data
+             * into a temporary buffer, wr_buf, and write it to the file.
+             */
+            wr_buf = NCI_Malloc(wr_amnt);
+            ptr = wr_buf;
+
+            for (j=0; j<npairs; j++) {
+                memcpy(ptr, recv_buf + bufAddr[j], len_ptr[j]);
+                ptr += len_ptr[j];
             }
         }
+    } /* end of if (npairs > 0) */
-        /* update number of pairs, now all off-len pairs are not overlapped */
-        npairs = i+1;
+    NCI_Free(meta);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_put[3] += endT - startT;
+#endif
+
+    /* set the fileview */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr);
+    if (err != NC_NOERR) {
+        if (status == NC_NOERR) status = err;
+        wr_amnt = 0;
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[4] = MAX(ncp->maxmem_put[4], mem_max);
+#endif
+
+    if (wr_buf != buf) {
+        /* If the write data has been packed into wr_buf, a contiguous buffer,
+         * buf_view must be updated before passing it to the MPI-IO/PNCIO file
+         * write.
+         */
+        buf_view.size = wr_amnt;
+        buf_view.type = MPI_BYTE;
+        buf_view.is_contig = 1;
+    }
+    /* the else case is when the user's buffer, buf, can be used to write */
+
+    /* carry out the write request to the file */
+    err = ncmpio_read_write(ncp, NC_REQ_WR, 0, buf_view, wr_buf);
+    if (status == NC_NOERR) status = err;
+
+    if (wr_buf != buf) NCI_Free(wr_buf);
+
+    /* Must free offsets and lengths now, as they may be realloc-ed in
+     * ina_collect_md()
+     */
+    if (offsets != NULL) NCI_Free(offsets);
+    if (lengths != NULL) NCI_Free(lengths);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_put[5] = MAX(ncp->maxmem_put[5], mem_max);
+#endif
+
+    return status;
+}
+
+static
+size_t bin_search(
+#ifdef HAVE_MPI_LARGE_COUNT
+                  MPI_Count key, MPI_Count *base,
+#else
+                  MPI_Offset key, MPI_Offset *base,
+#endif
+                  size_t nmemb)
+{
+    size_t low, high;
+
+    /* only one element (callers guarantee key >= base[0]) */
+    if (nmemb == 1)
+        return (base[0] <= key) ? 0 : (size_t)-1;
+
+    /* check the 1st element */
+    if (base[0] <= key && key < base[1])
+        return 0;
+
+    low = 1;
+    high = nmemb - 1;
+
+    while (low <= high) {
+        size_t mid = low + (high - low) / 2;
+        if (base[mid] == key)
+            return mid;
+        if (base[mid] < key)
+            low = mid + 1;
+        else
+            high = mid - 1;
+    }
+    return (low - 1);
+}
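How bin_search() is meant to be used: given the sorted starts of the coalesced intervals, it returns the index of the interval whose start is the largest one not exceeding the key. A tiny usage illustration with hypothetical values:

    /* starts[] holds sorted interval starting offsets */
    MPI_Offset starts[4] = {0, 100, 250, 400};
    size_t k = bin_search(130, starts, 4);
    /* k == 1, because 130 falls in the interval beginning at 100 */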
+/*----< ina_get() >----------------------------------------------------------*/
+/* This subroutine implements the intra-node aggregation for read operations.
+ */
+static
+int ina_get(NC         *ncp,
+            int         is_incr,   /* if offsets are incremental */
+            MPI_Aint    num_pairs, /* number of offset-length pairs */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count  *offsets,
+            MPI_Count  *lengths,
+#else
+            MPI_Offset *offsets,
+            int        *lengths,
+#endif
+            PNCIO_View  buf_view,
+            void       *buf)       /* user buffer */
+{
+    int i, j, err, mpireturn, status=NC_NOERR, nreqs;
+    int do_sort=0, indv_sorted=1, overlap=0;
+    char *rd_buf = NULL;
+    MPI_Aint npairs=0, max_npairs, *meta=NULL, *count=NULL;
+    MPI_Offset send_amnt=0, rd_amnt=0, off_start;
+    MPI_Request *req=NULL;
+    PNCIO_View rd_buf_view;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Count *off_ptr, *len_ptr, *orig_off_ptr, *orig_len_ptr;
+    MPI_Count bufLen, *orig_offsets=NULL, *orig_lengths=NULL;
+    MPI_Count *blks = NULL, *disps = NULL;
+#else
+    MPI_Offset *orig_offsets=NULL, *orig_off_ptr, *off_ptr;
+    int bufLen, *orig_lengths=NULL, *orig_len_ptr, *len_ptr, *blks = NULL;
+    MPI_Aint *disps = NULL;
+#endif
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double endT, startT = MPI_Wtime();
+    MPI_Offset mem_max;
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[0] = MAX(ncp->maxmem_get[0], mem_max);
+#endif
-    if (npairs == 1) {
-        /* No need to create fileType if writing to a contiguous space */
-        offset = offsets[0];
+    bufLen = buf_view.size;
+
+    /* Firstly, aggregators collect metadata from non-aggregators.
+     *
+     * This rank tells its aggregator how much metadata to receive from this
+     * rank, by sending
+     * 1. the number of offset-length pairs (num_pairs)
+     * 2. the user buffer size in bytes (bufLen)
+     * 3. whether this rank's offsets are sorted in increasing order.
+     * The message sent by this rank is three MPI_Aint values.
+     */
+    if (ncp->rank == ncp->my_aggr)
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * ncp->num_nonaggrs * 3);
+    else
+        meta = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * 3);
+
+    meta[0] = num_pairs;
+    meta[1] = bufLen;
+    meta[2] = is_incr;
+
+    /* Each aggregator first collects metadata about its offset-length pairs,
+     * the size of the read request, and whether the offsets are in an
+     * incremental order. The aggregator will gather these metadata from the
+     * non-aggregators assigned to it.
+     *
+     * Once ina_collect_md() returns, this aggregator's offsets and lengths may
+     * grow to include the ones from non-aggregators (appended).
+     */
+    if (ncp->num_nonaggrs > 1) {
+        err = ina_collect_md(ncp, meta, &offsets, &lengths, &npairs);
+        if (err != NC_NOERR) {
+            NCI_Free(meta);
+            return err;
         }
-    else {
+    }
+    else
+        npairs = num_pairs;
+
+    if (ncp->rank != ncp->my_aggr) {
+        if (meta[0] > 0) {
+            /* For read operations, the non-aggregators now can start receiving
+             * their read data from the aggregator.
+             */
+            MPI_Status st;
 #ifdef HAVE_MPI_LARGE_COUNT
-        /* construct fileview */
-        mpireturn = MPI_Type_create_hindexed_c(npairs, lengths, offsets,
-                                               MPI_BYTE, &fileType);
+            MPI_Count num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Recv_c)(buf, num, buf_view.type, ncp->my_aggr, 0,
+                                   ncp->comm, &st);
+#else
+            int num = (buf_view.is_contig) ? buf_view.size : 1;
+            TRACE_COMM(MPI_Recv)(buf, num, buf_view.type, ncp->my_aggr, 0,
+                                 ncp->comm, &st);
+#endif
+        }
+
+        /* Must free offsets and lengths now, as they may be realloc-ed in
+         * ina_collect_md()
+         */
+        if (offsets != NULL) NCI_Free(offsets);
+        if (lengths != NULL) NCI_Free(lengths);
+        /* Non-aggregators are now done, as they do not participate in MPI-IO
+         * or PNCIO file read.
+         */
+        NCI_Free(meta);
+        return status;
+    }
+
+    /* The remainder of this subroutine is for aggregators only. */
+
+    /* For read operations, the original offsets and lengths must be kept
+     * untouched, because the later sorting and coalescing will mess up the
+     * original order of offsets and lengths, which are needed to construct a
+     * datatype when an aggregator sends read data to its non-aggregators.
+     */
+#ifdef HAVE_MPI_LARGE_COUNT
+    orig_offsets = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs);
+    orig_lengths = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * npairs);
+    memcpy(orig_offsets, offsets, sizeof(MPI_Count) * npairs);
+    memcpy(orig_lengths, lengths, sizeof(MPI_Count) * npairs);
 #else
-        /* construct fileview */
-        mpireturn = MPI_Type_create_hindexed(npairs, lengths, offsets,
-                                             MPI_BYTE, &fileType);
+    orig_offsets = (MPI_Offset*) NCI_Malloc(sizeof(MPI_Offset) * npairs);
+    orig_lengths = (int*)        NCI_Malloc(sizeof(int) * npairs);
+    memcpy(orig_offsets, offsets, sizeof(MPI_Offset) * npairs);
+    memcpy(orig_lengths, lengths, sizeof(int) * npairs);
+#endif
+    orig_off_ptr = orig_offsets;
+    orig_len_ptr = orig_lengths;
+    off_ptr = offsets;
+    len_ptr = lengths;
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[1] = MAX(ncp->maxmem_get[1], mem_max);
 #endif
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
-            /* return the first encountered error if there is any */
-            if (status == NC_NOERR) status = err;
+
+    /* MPI-IO has the following requirements about filetype.
+     * 1. The (flattened) displacements (of a filetype) are not required to be
+     *    distinct, but they cannot be negative, and they must be monotonically
+     *    non-decreasing.
+     * 2. If the file is opened for writing, neither the etype nor the filetype
+     *    is permitted to contain overlapping regions.
+     */
+    if (npairs > 0) {
+        /* Now this aggregator has received all offset-length pairs from its
+         * non-aggregators. At first, check if a sorting is necessary.
+         */
+
+        /* check if offsets of all non-aggregators are individually sorted */
+        indv_sorted = 1;
+        for (i=-1,j=0; j<ncp->num_nonaggrs; j++) {
+            if (i == -1 && meta[j*3] > 0) /* find 1st whose num_pairs > 0 */
+                i = j;
+            if (meta[j*3+2] == 0) { /* j's offsets are not sorted */
+                indv_sorted = 0;
+                do_sort = 1;
+                break;
             }
-        else {
-            mpireturn = MPI_Type_commit(&fileType);
-            if (mpireturn != MPI_SUCCESS) {
-                err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_commit");
-                /* return the first encountered error if there is any */
-                if (status == NC_NOERR) status = err;
+        }
+        /* i is the first non-aggregator whose num_pairs > 0
+         * j is the first non-aggregator whose is_incr is false
+         */
+
+        if (i >= 0 && indv_sorted == 1) {
+            /* When all ranks' offsets are individually sorted, we still need
+             * to check if offsets are interleaved among all non-aggregators to
+             * determine whether a sort for all offset-length pairs is
+             * necessary.
+             */
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count prev_end_off;
+#else
+            MPI_Offset prev_end_off;
+#endif
+            assert(meta[i*3+2] == 1);
+
+            MPI_Aint sum = meta[i*3];
+            prev_end_off = off_ptr[sum-1]; /* last offset of non-aggregator i */
+
+            /* check if the offsets are interleaved */
+            for (++i; i<ncp->num_nonaggrs; i++) {
+                if (meta[i*3] == 0) /* zero-sized request */
+                    continue;
+                assert(meta[i*3+2] == 1);
+                if (prev_end_off > off_ptr[sum]) {
+                    /* off_ptr[sum] is non-aggregator i's 1st offset */
+                    do_sort = 1; /* offsets are not incrementing */
+                    break;
                 }
+                /* move on to the next non-aggregator */
+                sum += meta[i*3];
+                prev_end_off = off_ptr[sum-1];
             }
         }
+
+        if (do_sort && indv_sorted) {
+            /* Interleaved offsets are found but individual offsets are already
+             * sorted. In this case, heap_merge() is called to merge all
+             * offsets into one single sorted offset list. Note count[] is
+             * initialized and will be used in heap_merge()
+             */
+            count = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint)* ncp->num_nonaggrs);
+            for (i=0; i<ncp->num_nonaggrs; i++) count[i] = meta[i*3];
+        }
+
+        /* Sort the offset-length pairs when necessary. */
+        if (do_sort) {
+            /* Sort offsets and lengths, based on offsets, into an increasing
+             * order.
+             */
+            if (indv_sorted) {
+                /* heap_merge() runs much faster than qsort() when individual
+                 * lists have already been sorted. However, it has a much
+                 * bigger memory footprint.
+                 */
+                heap_merge(ncp->num_nonaggrs, count, npairs, off_ptr, len_ptr,
+                           NULL);
+                NCI_Free(count);
+            }
+            else
+                /* When some individual offsets are not sorted, we cannot use
+                 * heap_merge(). Note qsort() is an in-place sorting.
+                 */
+                qsort_off_len_buf(npairs, off_ptr, len_ptr, NULL);
+        }
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_get[2] = MAX(ncp->maxmem_get[2], mem_max);
+        ncp->ina_npairs_get = MAX(ncp->ina_npairs_get, npairs);
 #endif
-    if (ncp->rank != ncp->my_aggr) /* non-aggregator writes nothing */
-        buf_count = 0;
+        /* Coalesce the offset-length pairs and calculate the total read amount
+         * and send amount by this aggregator.
+         */
+        overlap = 0;
+        send_amnt = rd_amnt = len_ptr[0];
+        for (i=0, j=1; j<npairs; j++) {
+            send_amnt += len_ptr[j];
+            MPI_Offset gap = off_ptr[i] + len_ptr[i] - off_ptr[j];
+            if (gap >= 0) { /* overlap detected, merge j into i */
+                /* when gap > 0, pairs i and j overlap
+                 * when gap == 0, pairs i and j are contiguous
+                 */
+                MPI_Offset i_end, j_end;
+
+                if (gap > 0) overlap = 1;
+
+                i_end = off_ptr[i] + len_ptr[i];
+                j_end = off_ptr[j] + len_ptr[j];
+                if (i_end < j_end) {
+                    len_ptr[i] += j_end - i_end;
+                    rd_amnt += j_end - i_end;
+                }
+                /* else: j is entirely covered by i */
+            }
+            else { /* j and i do not overlap */
+                rd_amnt += len_ptr[j];
+                i++;
+                if (i < j) {
+                    off_ptr[i] = off_ptr[j];
+                    len_ptr[i] = len_ptr[j];
+                }
+            }
+        }
+
+        /* update npairs after coalescing */
+        npairs = i+1;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        ncmpi_inq_malloc_size(&mem_max);
+        ncp->maxmem_get[3] = MAX(ncp->maxmem_get[3], mem_max);
+#endif
+    } /* if (npairs > 0) */
+    /* else case: This aggregation group may not have data to read, but it must
+     * still participate in the collective MPI-IO calls.
+     */
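The distinction between send_amnt and rd_amnt above is simply "sum of all request lengths" versus "length of the union of the intervals". A minimal standalone illustration over pairs already sorted by offset (simplified types; not the patch's code):

    #include <stddef.h>

    /* off[]/len[] sorted by off[]; returns the union length and stores the
     * plain sum of all lengths in *tot */
    static long long union_len(const long long *off, const long long *len,
                               size_t n, long long *tot)
    {
        long long u = 0, end = -1;   /* end of the union covered so far */
        size_t j;
        *tot = 0;
        for (j = 0; j < n; j++) {
            long long e = off[j] + len[j];
            *tot += len[j];
            if (off[j] > end)  u += len[j];     /* disjoint interval   */
            else if (e > end)  u += e - end;    /* partial overlap     */
            if (e > end) end = e;
        }
        return u;
    }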
-    fh = ncp->collective_fh;
-    /* set the MPI-IO fileview, this is a collective call */
-    err = ncmpio_file_set_view(ncp, fh, &offset, fileType);
-    if (fileType != MPI_BYTE) MPI_Type_free(&fileType);
+    /* set the fileview */
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, npairs, off_ptr, len_ptr);
     if (err != NC_NOERR) {
         if (status == NC_NOERR) status = err;
-        buf_count = 0;
+        rd_amnt = 0;
+    }
+
+    /* Allocate the read buffer and send buffer. Once data are read from file
+     * into rd_buf, it is unpacked into send_buf for each non-aggregator.
+     * send_buf will be directly used to send the read request data to
+     * non-aggregators.
+     *
+     * Note rd_amnt may not be the same as send_amnt, as there can be overlaps
+     * between adjacent offset-length pairs after they are sorted.
+     *
+     * If the file offset-length pairs have not been re-ordered, i.e. sorted
+     * and overlaps removed, and this aggregator will not send any read data to
+     * its non-aggregators, then we can use the user's buffer, buf, to call
+     * MPI-IO/PNCIO to read from the file, without allocating an additional
+     * temporary buffer.
+     */
+    if (!do_sort && buf_view.size == send_amnt && !overlap) {
+        rd_buf_view = buf_view;
+        rd_buf = buf;
     }
+    else {
+        /* Read data will be stored in a contiguous read buffer. */
+        rd_buf_view.size = rd_amnt;
+        rd_buf_view.type = MPI_BYTE;
+        rd_buf_view.is_contig = 1;
+        if (rd_amnt > 0)
+            rd_buf = (char*) NCI_Malloc(rd_amnt);
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[4] = MAX(ncp->maxmem_get[4], mem_max);
+    endT = MPI_Wtime();
+    ncp->ina_time_get[0] += endT - startT;
+#endif
-    /* call MPI_File_write_at_all */
-    err = ncmpio_read_write(ncp, NC_REQ_WR, NC_REQ_COLL, offset, buf_count,
-                            MPI_BYTE, wr_buf, 1);
+    err = ncmpio_read_write(ncp, NC_REQ_RD, 0, rd_buf_view, rd_buf);
     if (status == NC_NOERR) status = err;
-    if (wr_buf != NULL) NCI_Free(wr_buf);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    ncmpi_inq_malloc_size(&mem_max);
+    ncp->maxmem_get[5] = MAX(ncp->maxmem_get[5], mem_max);
+    startT = MPI_Wtime();
+#endif
+
+    /* If sorting has been performed, the orders of off_ptr[] and len_ptr[] may
+     * no longer be the same as the original ones. We must use binary search to
+     * find the aggregated offset-length pair containing each non-aggregator's
+     * offset-length pair to construct a send buffer datatype, a view layout of
+     * the read buffer, rd_buf, so the data can be directly sent from rd_buf.
+     */
+    if (rd_buf != buf) {
+        /* First, aggregators copy the read data to their own user buffer.
+         * Note off_ptr[] is sorted in an incremental order.
+         *
+         * When the offset-length pairs of the read buffer have been sorted or
+         * the read buffer size is smaller than the total get amount, we must
+         * search and copy from the read buffer to self's user buffer.
+         */
+        char *ptr=NULL, *tmp_buf=NULL;
+        size_t m=0, k, scan_off=0;
+
+        /* If this aggregator's user buftype is contiguous, then reuse its
+         * read buffer. If not, allocate a temporary buffer, copy the read
+         * data over, and then unpack it into the user buffer.
+         */
+        if (buf_view.is_contig)
+            ptr = buf;
+        else if (bufLen > 0)
+            ptr = tmp_buf = (char*) NCI_Malloc(bufLen);
+
+        for (j=0; j<num_pairs; j++) {
+            /* locate the coalesced pair containing orig_off_ptr[j] */
+            k = bin_search(orig_off_ptr[j], off_ptr, (size_t)npairs);
+            /* scan_off is the byte offset of off_ptr[m] into rd_buf */
+            for (; m<k; m++) scan_off += len_ptr[m];
+            memcpy(ptr, rd_buf + scan_off + (orig_off_ptr[j] - off_ptr[k]),
+                   orig_len_ptr[j]);
+            ptr += orig_len_ptr[j];
+        }
+
+        if (bufLen > 0 && !buf_view.is_contig) {
+#ifdef HAVE_MPI_LARGE_COUNT
+            MPI_Count pos=0;
+            MPI_Unpack_c(tmp_buf, bufLen, &pos, buf, 1, buf_view.type,
+                         MPI_COMM_SELF);
+#else
+            int pos=0;
+            MPI_Unpack(tmp_buf, bufLen, &pos, buf, 1, buf_view.type,
+                       MPI_COMM_SELF);
+#endif
+            NCI_Free(tmp_buf);
+        }
+    }
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[1] += endT - startT;
+    startT = endT;
+#endif
+
+    if (ncp->num_nonaggrs == 1)
+        /* In this case, communication is not necessary. */
+        goto fn_exit;
+
+    /* Aggregators start sending read data to non-aggregators. At first,
+     * allocate array_of_blocklengths[] and array_of_displacements[]
+     */
+    for (max_npairs=0, i=1; i<ncp->num_nonaggrs; i++)
+        max_npairs = MAX(meta[3*i], max_npairs);
+
+#ifdef HAVE_MPI_LARGE_COUNT
+    blks  = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs);
+    disps = (MPI_Count*) NCI_Malloc(sizeof(MPI_Count) * max_npairs);
+#else
+    blks  = (int*)      NCI_Malloc(sizeof(int) * max_npairs);
+    disps = (MPI_Aint*) NCI_Malloc(sizeof(MPI_Aint) * max_npairs);
+#endif
+
+    /* Now, send data to each non-aggregator */
+    req = (MPI_Request*)NCI_Malloc(sizeof(MPI_Request) * ncp->num_nonaggrs);
+    nreqs = 0;
+    off_start = meta[0];
+    for (i=1; i<ncp->num_nonaggrs; i++) {
+        /* populate disps[] and blks[] */
+        MPI_Aint remote_num_pairs = meta[3*i];
+        MPI_Aint remote_is_incr = meta[3*i+2];
+
+        if (remote_num_pairs == 0) continue; /* zero-sized request */
+
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Count *off = orig_off_ptr + off_start;
+        MPI_Count *len = orig_len_ptr + off_start;
+#else
+        MPI_Offset *off = orig_off_ptr + off_start;
+        int        *len = orig_len_ptr + off_start;
+#endif
+        size_t k, m = 0;
+        size_t scan_off = 0;
+        for (j=0; j<remote_num_pairs; j++) {
+            MPI_Aint addr;
+            if (!remote_is_incr) { m = 0; scan_off = 0; }
+            /* locate the coalesced pair containing off[j] */
+            k = bin_search(off[j], off_ptr, (size_t)npairs);
+            for (; m<k; m++) scan_off += len_ptr[m];
+            /* absolute address into rd_buf, sent below with MPI_BOTTOM */
+            MPI_Get_address(rd_buf + scan_off + (off[j] - off_ptr[k]), &addr);
+            disps[j] = addr;
+            blks[j]  = len[j];
+        }
+        off_start += remote_num_pairs;
+
+        /* construct a datatype describing this non-aggregator's view of
+         * rd_buf and send the read data in one message
+         */
+        MPI_Datatype sendType;
+#ifdef HAVE_MPI_LARGE_COUNT
+        mpireturn = MPI_Type_create_hindexed_c(remote_num_pairs, blks, disps,
+                                               MPI_BYTE, &sendType);
+#else
+        mpireturn = MPI_Type_create_hindexed(remote_num_pairs, blks, disps,
+                                             MPI_BYTE, &sendType);
+#endif
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed");
+            /* return the first encountered error if there is any */
+            if (status == NC_NOERR) status = err;
+        }
+        else {
+            MPI_Type_commit(&sendType);
+#ifdef HAVE_MPI_LARGE_COUNT
+            TRACE_COMM(MPI_Isend_c)(MPI_BOTTOM, 1, sendType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#else
+            TRACE_COMM(MPI_Isend)(MPI_BOTTOM, 1, sendType,
+                       ncp->nonaggr_ranks[i], 0, ncp->comm, &req[nreqs++]);
+#endif
+            MPI_Type_free(&sendType);
+        }
+    }
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[2] += endT - startT;
+    startT = endT;
+#endif
+
+    if (nreqs > 0) {
+#ifdef HAVE_MPI_STATUSES_IGNORE
+        TRACE_COMM(MPI_Waitall)(nreqs, req, MPI_STATUSES_IGNORE);
+#else
+        MPI_Status *statuses = (MPI_Status *)
+                               NCI_Malloc(nreqs * sizeof(MPI_Status));
+        TRACE_COMM(MPI_Waitall)(nreqs, req, statuses);
+        NCI_Free(statuses);
+#endif
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn,"MPI_Waitall");
+            /* return the first encountered error if there is any */
+            if (status == NC_NOERR) status = err;
+        }
+    }
+    NCI_Free(blks);
+    NCI_Free(disps);
+
+fn_exit:
+    /* offsets[] and lengths[] are used in PNCIO read subroutines as the
+     * flattened filetype. They cannot be freed before the I/O is done.
+     */
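heap_merge() exploits the fact that each member's pairs arrive already sorted: merging k sorted runs is near-linear, instead of the O(n log n) a general qsort costs. The two-run case shows the idea (an illustration; the patch's heap_merge also carries the lengths and buffer addresses along with the offsets):

    #include <stddef.h>

    /* merge sorted a[na] and b[nb] into out[na+nb] */
    static void merge2(const long long *a, size_t na,
                       const long long *b, size_t nb, long long *out)
    {
        size_t i = 0, j = 0, k = 0;
        while (i < na && j < nb)
            out[k++] = (a[i] <= b[j]) ? a[i++] : b[j++];
        while (i < na) out[k++] = a[i++];
        while (j < nb) out[k++] = b[j++];
    }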
+    if (rd_buf != NULL && rd_buf != buf) NCI_Free(rd_buf);
+    if (orig_lengths != NULL) NCI_Free(orig_lengths);
+    if (orig_offsets != NULL) NCI_Free(orig_offsets);
+    if (req != NULL) NCI_Free(req);
+    if (meta != NULL) NCI_Free(meta);
+
+    /* Must free offsets and lengths now, as they may be realloc-ed in
+     * ina_collect_md()
+     */
+    if (offsets != NULL) NCI_Free(offsets);
+    if (lengths != NULL) NCI_Free(lengths);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    endT = MPI_Wtime();
+    ncp->ina_time_get[3] += endT - startT;
+#endif
     return status;
 }
-/*----< ncmpio_intra_node_aggregation_nreqs() >------------------------------*/
-/* This is a collective call */
+/*----< req_compare() >------------------------------------------------------*/
+/* used to sort reqs[] based on their starting file offsets */
+static int
+req_compare(const void *a, const void *b)
+{
+    if (((NC_req*)a)->offset_start > ((NC_req*)b)->offset_start) return  (1);
+    if (((NC_req*)a)->offset_start < ((NC_req*)b)->offset_start) return (-1);
+    return (0);
+}
+
+/*----< ncmpio_ina_nreqs() >-------------------------------------------------*/
+/* This subroutine handles PnetCDF's requests made from non-blocking APIs,
+ * which contain multiple requests to one or more variables. The input
+ * arguments are described below.
+ * reqMode:    NC_REQ_RD for read requests and NC_REQ_WR for write.
+ * num_reqs:   number of elements in array req_list.
+ * req_list[]: stores pending requests from non-blocking API calls, which is
+ *             used to construct file offset-length pairs and the user buffer
+ *             datatype.
+ * newnumrecs: number of new records
+ */
 int
-ncmpio_intra_node_aggregation_nreqs(NC         *ncp,
-                                    int         reqMode,
-                                    int         num_reqs,
-                                    NC_req     *put_list,
-                                    MPI_Offset  newnumrecs)
+ncmpio_ina_nreqs(NC         *ncp,
+                 int         reqMode,
+                 int         num_reqs,
+                 NC_req     *req_list,
+                 MPI_Offset  newnumrecs)
 {
-    int err, status=NC_NOERR;
-    MPI_Aint bufLen, num_pairs;
+    int err, status=NC_NOERR, is_incr=1;
+    void *buf=NULL;
+    MPI_Aint num_pairs;
 #ifdef HAVE_MPI_LARGE_COUNT
     MPI_Count *offsets=NULL, *lengths=NULL;
 #else
-    MPI_Aint *offsets=NULL;
+    MPI_Offset *offsets=NULL;
     int *lengths=NULL;
 #endif
-    MPI_Datatype bufType=MPI_BYTE;
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
     double timing = MPI_Wtime();
 #endif
-    /* currently supports write requests only */
-    if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR;
+// printf("%s at %d: rank=%d num_aggrs_per_node=%d my_aggr=%d num_nonaggrs=%d\n",__func__,__LINE__, ncp->rank, ncp->num_aggrs_per_node, ncp->my_aggr, ncp->num_nonaggrs);
+
+    /* populate reqs[].offset_start, the starting offset of each request */
+    NC_req *reqs = req_list;
+    int i, decreasing=0;
+    for (i=0; i<num_reqs; i++) {
+        NC_var *varp;
+        NC_lead_req *lead;
+
+        lead = (fIsSet(reqMode, NC_REQ_RD)) ? ncp->get_lead_list
+                                            : ncp->put_lead_list;
+        lead += reqs[i].lead_off;
+        varp = lead->varp;
+
+        if (varp->ndims == 0) { /* scalar variable */
+            reqs[i].offset_start += varp->begin;
+        }
+        else if (reqs[i].npairs == 1) { /* only one offset-length pair */
+            MPI_Offset off = varp->begin;
+
+            if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize;
-    assert(ncp->my_aggr >= 0);
+// printf("%s at %d: num_reqs=%d reqs[%d].npairs == 1 offset_start=%lld off=%lld\n", __func__,__LINE__,num_reqs,i,reqs[i].offset_start,off);
+            reqs[i].offset_start += off;
+        }
+        else {
+            /* start/count/stride have been allocated in a contiguous array */
+            MPI_Offset *count, *stride, offset_end;
+            count = reqs[i].start + varp->ndims;
+            stride = (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) ? NULL :
+                     count + varp->ndims;
+
+            /* calculate the access range of this request */
+            ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride,
+                                  &reqs[i].offset_start, &offset_end);
+        }
+        /* check if offset_start values are in a monotonic nondecreasing order */
+        if (i > 0 && reqs[i].offset_start < reqs[i-1].offset_start)
+            decreasing = 1;
+    }
+
+    /* If a decreasing order is found, sort reqs[] based on reqs[].offset_start
+     * into an increasing order.
+     */
+    if (decreasing)
+        qsort(reqs, (size_t)num_reqs, sizeof(NC_req), req_compare);
+
+// printf("%s at %d: decreasing=%d\n",__func__,__LINE__, decreasing);
     /* construct file offset-length pairs
      *     num_pairs: total number of off-len pairs
      *     offsets:   array of flattened offsets
      *     lengths:   array of flattened lengths
+     *     is_incr:   whether offsets are incremental
      */
     if (num_reqs > 0)
-        flatten_reqs(ncp, num_reqs, put_list, &num_pairs, &offsets, &lengths);
+        flatten_reqs(ncp, reqMode, num_reqs, reqs, &is_incr, &num_pairs,
+                     &offsets, &lengths);
     else
         num_pairs = 0;
-    /* construct write buffer datatype, bufType.
-     * bufLen is the buffer size in bytes
+#if 0
+if (0 && num_pairs==10) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs,
+offsets[0],offsets[1],offsets[2],offsets[3],offsets[4],offsets[5],
+offsets[6],offsets[7],offsets[8],offsets[9],
+lengths[0],lengths[1],lengths[2],lengths[3],lengths[4],lengths[5],
+lengths[6],lengths[7],lengths[8],lengths[9]);
+
+else if (num_pairs==12) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld len=%lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",__func__,__LINE__, num_reqs, num_pairs,
+offsets[0],offsets[1],offsets[2],offsets[3],offsets[4],
+offsets[5],offsets[6],offsets[7],offsets[8],offsets[9],
+offsets[10],offsets[11],
+lengths[0],lengths[1],lengths[2],lengths[3],lengths[4],
+lengths[5],lengths[6],lengths[7],lengths[8],lengths[9],
+lengths[10],lengths[11]);
+else if (num_pairs) printf("%s at %d: num_reqs=%d num_pairs=%ld off=%lld len=%lld\n",__func__,__LINE__, num_reqs, num_pairs,offsets[0],lengths[0]);
+#endif
+
+    /* Populate buf_view, which contains metadata of the user buffers in the
+     * nonblocking requests. If the buffer is non-contiguous, buf is set to
+     * NULL and buf_view.type will be a derived datatype constructed using
+     * MPI_BOTTOM.
      */
-    if (num_reqs > 0) {
-        construct_buf_type(ncp, num_reqs, put_list, &bufLen, &bufType);
-        bufLen = 1;
-    }
-    else
-        bufLen = 0;
+    PNCIO_View buf_view;
+    err = flat_buf_type(ncp, reqMode, num_reqs, reqs, &buf_view, &buf);
+    if (status == NC_NOERR) status = err;
+if (num_reqs > 0) assert(buf != NULL);
+
+#if 0
+if (buf_view.count > 1) printf("%s at %d: buf_view count=%lld off=%lld %lld len=%lld %lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.off[1], buf_view.len[0],buf_view.len[1]);
+else if (buf_view.count) printf("%s at %d: buf_view count=%lld off=%lld len=%lld\n",__func__,__LINE__, buf_view.count, buf_view.off[0], buf_view.len[0]);
+
+{int *wkl;
+int nelems, j,k, xsz=4;
+char *xbuf, msg[1024],str[64];
+printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size);
+ wkl = (int*) malloc(buf_view.size);
+ nelems=buf_view.size/xsz;
+ xbuf = buf;
+ memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz);
+ sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf);
+ for (k=0; k<nelems; k++) { sprintf(str,"%d ", wkl[k]); strcat(msg, str); }
+ printf("%s\n", msg);
+ free(wkl);}
+#endif
 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+    if (ncp->rank == ncp->my_aggr) ncp->ina_time_flatten += MPI_Wtime() - timing;
 #endif
-    err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufLen,
-                                 bufType, NULL);
+    int saved_my_aggr, saved_num_nonaggrs;
+    saved_my_aggr = ncp->my_aggr;
+    saved_num_nonaggrs = ncp->num_nonaggrs;
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if
+         * self rank is an INA aggregator and the INA group size is 1.
+         */
+        ncp->my_aggr = ncp->rank;
+        ncp->num_nonaggrs = 1;
+    }
+
+// printf("%s at %d: is_incr=%d buf=%p\n",__func__,__LINE__, is_incr,buf);
+    /* perform intra-node aggregation */
+    if (fIsSet(reqMode, NC_REQ_WR))
+        err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+    else
+        err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
     if (status == NC_NOERR) status = err;
-    /* free and reset bufType */
-    if (bufType != MPI_BYTE && bufType != MPI_DATATYPE_NULL)
-        MPI_Type_free(&bufType);
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* restore ncp->my_aggr and ncp->num_nonaggrs */
+        ncp->my_aggr = saved_my_aggr;
+        ncp->num_nonaggrs = saved_num_nonaggrs;
+    }
+
+#if 0
+if (fIsSet(reqMode, NC_REQ_RD))
+{int *wkl;
+int nelems, j,k, xsz=4;
+char *xbuf, msg[1024],str[64];
+printf("%s at %d: buf_view count=%lld size=%lld\n",__func__,__LINE__, buf_view.count,buf_view.size);
+ wkl = (int*) malloc(buf_view.size);
+ nelems=buf_view.size/xsz;
+ xbuf = buf;
+ memcpy(wkl, xbuf, buf_view.size); ncmpii_in_swapn(wkl, nelems, xsz);
+ sprintf(msg,"%s at %d: nelems=%d buf=(%p) ",__func__,__LINE__, nelems, xbuf);
+ for (k=0; k<nelems; k++) { sprintf(str,"%d ", wkl[k]); strcat(msg, str); }
+ printf("%s\n", msg);
+ free(wkl);}
+#endif
+
+    return status;
+}
-/*----< ncmpio_intra_node_aggregation() >------------------------------------*/
-/* This is a collective call */
+ * start[]: starting offsets + * count[]: counts along each dimension + * stride[]: stride along each dimension + * buf_len: size of I/O buffer in bytes + * buf: pointer to the user buffer + */ int -ncmpio_intra_node_aggregation(NC *ncp, - int reqMode, - NC_var *varp, - const MPI_Offset *start, - const MPI_Offset *count, - const MPI_Offset *stride, - MPI_Offset bufCount, - MPI_Datatype bufType, - void *buf) +ncmpio_ina_req(NC *ncp, + int reqMode, + NC_var *varp, + const MPI_Offset *start, + const MPI_Offset *count, + const MPI_Offset *stride, + MPI_Offset buf_len, + void *buf) { - int err, status=NC_NOERR; + int err, status=NC_NOERR, is_incr=1; MPI_Aint num_pairs; + PNCIO_View buf_view; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *offsets=NULL, *lengths=NULL; #else - MPI_Aint *offsets=NULL; + MPI_Offset *offsets=NULL; int *lengths=NULL; #endif #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) double timing = MPI_Wtime(); #endif - /* currently supports write requests only */ - if (fIsSet(reqMode, NC_REQ_RD)) return NC_NOERR; - - if (buf == NULL) /* zero-length request */ - return intra_node_aggregation(ncp, 0, NULL, NULL, 0, MPI_BYTE, NULL); - - /* construct file offset-length pairs - * num_pairs: total number of off-len pairs - * offsets: array of flattened offsets - * lengths: array of flattened lengths - */ - err = flatten_req(ncp, varp, start, count, stride, &num_pairs, &offsets, - &lengths); - if (err != NC_NOERR) { + /* blocking API's buffer passed here is always contiguous */ + buf_view.type = MPI_BYTE; + buf_view.is_contig = 1; + buf_view.size = buf_len; + buf_view.count = 0; + buf_view.off = NULL; + buf_view.len = NULL; + +// printf("%s at %d: buf=%s\n",__func__,__LINE__, (buf==NULL)?"NULL":"NOT NULL"); + if (buf_len == 0 || buf == NULL) { + /* This is a zero-length request. When in collective data mode, this + * rank must still participate collective calls. When INA is enabled, + * this rank tells its aggregator that it has no I/O data. When INA is + * disabled, this rank must participate other collective file call. 
+         */
         num_pairs = 0;
-        if (offsets != NULL)
-            NCI_Free(offsets);
-        offsets = NULL;
+        buf_view.size  = 0;
+        buf_view.count = 0;
     }
-    status = err;
+    else {
+        /* construct file access offset-length pairs
+         * num_pairs: total number of off-len pairs
+         * offsets:   array of flattened offsets
+         * lengths:   array of flattened lengths
+         * is_incr:   whether offsets are incremental
+         */
+        err = flatten_req(ncp, varp, start, count, stride, &is_incr,
+                          &num_pairs, &offsets, &lengths);
+        if (err != NC_NOERR) { /* make this rank's request zero-sized */
+            is_incr = 1;
+            num_pairs = 0;
+            buf_len = 0;
+            buf_view.size  = 0;
+            buf_view.count = 0;
+            if (offsets != NULL) NCI_Free(offsets);
+            if (lengths != NULL) NCI_Free(lengths);
+            offsets = NULL;
+            lengths = NULL;
+        }
+        status = err;
+    }

 #if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
-    ncp->aggr_time += MPI_Wtime() - timing;
+    if (ncp->rank == ncp->my_aggr)
+        ncp->ina_time_flatten += MPI_Wtime() - timing;
 #endif

-    err = intra_node_aggregation(ncp, num_pairs, offsets, lengths, bufCount,
-                                 bufType, buf);
-    if (status == NC_NOERR) status = err;
+    int saved_my_aggr, saved_num_nonaggrs;
+    saved_my_aggr = ncp->my_aggr;
+    saved_num_nonaggrs = ncp->num_nonaggrs;
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* Temporarily set ncp->my_aggr and ncp->num_nonaggrs to be as if
+         * self rank is an INA aggregator and the INA group size is 1.
+         */
+        ncp->my_aggr = ncp->rank;
+        ncp->num_nonaggrs = 1;
+    }
+
+    /* perform intra-node aggregation */
+    if (fIsSet(reqMode, NC_REQ_WR)) {
+        err = ina_put(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+        if (status == NC_NOERR) status = err;
+    }
+    else {
+        err = ina_get(ncp, is_incr, num_pairs, offsets, lengths, buf_view, buf);
+        if (status == NC_NOERR) status = err;
+    }
+
+    if (ncp->num_aggrs_per_node == 0 || fIsSet(ncp->flags, NC_MODE_INDEP)) {
+        /* restore ncp->my_aggr and ncp->num_nonaggrs */
+        ncp->my_aggr = saved_my_aggr;
+        ncp->num_nonaggrs = saved_num_nonaggrs;
+    }

     return status;
 }
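ncmpio_ina_req() is reached from the user-level blocking APIs. For context, a minimal, hedged user-side sketch of such a call path; the file name, dimension layout, and sizes below are made up for illustration:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: a blocking collective write with one variable and a
 * contiguous user buffer, i.e. the single-request case handled above. */
int write_block(MPI_Comm comm, int rank)
{
    int ncid, dimid, varid, err;
    MPI_Offset start[1], count[1];
    float buf[10];

    for (int i = 0; i < 10; i++) buf[i] = (float)rank;

    err = ncmpi_create(comm, "testfile.nc", NC_CLOBBER, MPI_INFO_NULL, &ncid);
    if (err != NC_NOERR) return err;
    ncmpi_def_dim(ncid, "x", NC_UNLIMITED, &dimid);  /* record dimension */
    ncmpi_def_var(ncid, "var", NC_FLOAT, 1, &dimid, &varid);
    ncmpi_enddef(ncid);

    start[0] = rank * 10;  /* each rank writes its own contiguous block */
    count[0] = 10;
    err = ncmpi_put_vara_float_all(ncid, varid, start, count, buf);

    ncmpi_close(ncid);
    return err;
}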
diff --git a/src/drivers/ncmpio/ncmpio_open.c b/src/drivers/ncmpio/ncmpio_open.c
index a24726ee9..7d5ec6d23 100644
--- a/src/drivers/ncmpio/ncmpio_open.c
+++ b/src/drivers/ncmpio/ncmpio_open.c
@@ -39,92 +39,91 @@ ncmpio_open(MPI_Comm comm,
             MPI_Info    user_info, /* user's and env info combined */
             void      **ncpp)
 {
-    char *env_str, *mpi_name;
-    int i, mpiomode, err, status=NC_NOERR, mpireturn;
-    MPI_File fh;
-    MPI_Info info_used;
+    char *filename, *env_str, value[MPI_MAX_INFO_VAL + 1], *mpi_name;
+    int i, rank, nprocs, mpiomode, err, status=NC_NOERR, mpireturn, flag;
+    MPI_File fh=MPI_FILE_NULL;
     NC *ncp=NULL;

     *ncpp = NULL;

+    MPI_Comm_rank(comm, &rank);
+    MPI_Comm_size(comm, &nprocs);
+
     /* Note path's validity and omode consistency have been checked in
-     * ncmpi_open() in src/dispatchers/file.c and
-     * path consistency will be done in MPI_File_open */
+     * ncmpi_open() in src/dispatchers/file.c and path consistency will be done
+     * in MPI_File_open.
+     */

     /* First, check whether omode is valid or supported ---------------------*/
+    /* NC_DISKLESS is not supported yet */
     if (omode & NC_DISKLESS) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE)

     /* NC_MMAP is not supported yet */
     if (omode & NC_MMAP) DEBUG_RETURN_ERROR(NC_EINVAL_OMODE)

-#if 0 && defined(HAVE_ACCESS)
-    if (mpiomode == MPI_MODE_RDONLY) { /* file should already exit */
-        int rank, file_exist;
-        MPI_Comm_rank(comm, &rank);
-        if (rank == 0) {
-            if (access(path, F_OK) == 0) file_exist = 1;
-            else file_exist = 0;
-        }
-        TRACE_COMM(MPI_Bcast)(&file_exist, 1, MPI_INT, 0, comm);
-        if (!file_exist) DEBUG_RETURN_ERROR(NC_ENOENT)
-    }
-#endif
+    /* allocate buffer for header object NC and initialize its contents */
+    ncp = (NC*) NCI_Calloc(1, sizeof(NC));
+    if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)

-    /* open file collectively ---------------------------------------------- */
-    mpiomode = fIsSet(omode, NC_WRITE) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
+    *ncpp = (void*)ncp;

-    TRACE_IO(MPI_File_open, (comm, (char *)path, mpiomode, user_info, &fh));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    ncp->ncid   = ncid;
+    ncp->comm   = comm; /* reuse comm duplicated in dispatch layer */
+    ncp->rank   = rank;
+    ncp->nprocs = nprocs;

-    /* get the file info used/modified by MPI-IO */
-    TRACE_IO(MPI_File_get_info, (fh, &info_used));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
+    /* Extract hints from user_info. Two hints must be extracted now in order
+     * to continue:
+     * nc_pncio: whether to use MPI-IO or PnetCDF's PNCIO driver.
+     * nc_num_aggrs_per_node: number of processes per node to be INA
+     * aggregators.
+     *
+     * ncp->fstype will be set in ncmpio_hint_extract().
+     */
+    ncmpio_hint_extract(ncp, user_info);

-    /* Now the file has been successfully opened, allocate/set NC object */
+    if (ncp->fstype == PNCIO_FSTYPE_CHECK)
+        /* Check file system type. If the given file does not exist, check its
+         * folder. Currently PnetCDF's PNCIO drivers support Lustre
+         * (PNCIO_LUSTRE) and Unix File System (PNCIO_UFS).
+         */
+        ncp->fstype = PNCIO_FileSysType(path);

-    /* path's validity and omode consistency have been checked in ncmpi_open()
-     * in src/dispatchers/file.c */
-
-    /* allocate buffer for header object NC */
-    ncp = (NC*) NCI_Calloc(1, sizeof(NC));
-    if (ncp == NULL) DEBUG_RETURN_ERROR(NC_ENOMEM)
+    /* Remove the file system type prefix name if there is any. For example,
+     * when path = "lustre:/home/foo/testfile.nc", remove "lustre:" to make
+     * filename point to "/home/foo/testfile.nc", so it can be used in the
+     * POSIX access() call below.
+     */
+    filename = ncmpii_remove_file_system_type_prefix(path);
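Both hints consumed by ncmpio_hint_extract() are ordinary MPI_Info key-value pairs set by the application before open/create. A hedged user-side sketch; the hint names come from this patch, while the values are illustrative only:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: ask PnetCDF to use its PNCIO driver and two
 * intra-node aggregators per compute node. */
int open_with_hints(MPI_Comm comm, const char *path, int *ncidp)
{
    int err;
    MPI_Info info;

    MPI_Info_create(&info);
    MPI_Info_set(info, "nc_pncio", "enable");          /* use PNCIO driver */
    MPI_Info_set(info, "nc_num_aggrs_per_node", "2");  /* enable INA */

    err = ncmpi_open(comm, path, NC_NOWRITE, info, ncidp);
    MPI_Info_free(&info);
    return err;
}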
-    /* PnetCDF default fill mode is no fill */
-    fClr(ncp->flags, NC_MODE_FILL);
-    if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY);
+    ncp->path     = path; /* reuse path duplicated in dispatch layer */
+    ncp->pncio_fh = NULL;
+    ncp->iomode   = omode;

-    ncp->ncid = ncid;
+    ncp->collective_fh  = MPI_FILE_NULL;
+    ncp->independent_fh = MPI_FILE_NULL;

-    /* chunk size for reading header (set default before check hints) */
-    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
+    /* Set the file open mode in mpiomode, which may later be needed in
+     * ncmpi_begin_indep_data() to open the file for independent data mode.
+     */
+    mpiomode = fIsSet(omode, NC_WRITE) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
+    ncp->mpiomode = mpiomode;

-    /* buffer to pack noncontiguous user buffers when calling wait() */
-    ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE;
+    /* PnetCDF default fill mode is no fill */
+    fClr(ncp->flags, NC_MODE_FILL);

-    /* Extract PnetCDF specific I/O hints from user_info and set default hint
-     * values into info_used. Note some MPI libraries, such as MPICH 3.3.1 and
-     * priors fail to preserve user hints that are not recogniozed by the MPI
-     * libraries.
-     */
-    ncmpio_set_pnetcdf_hints(ncp, user_info, info_used);
-
-    ncp->iomode = omode;
-    ncp->comm = comm; /* reuse comm duplicated in dispatch layer */
-    MPI_Comm_rank(comm, &ncp->rank);
-    MPI_Comm_size(comm, &ncp->nprocs);
-    ncp->mpiinfo = info_used; /* is not MPI_INFO_NULL */
-    ncp->mpiomode = mpiomode;
-    ncp->collective_fh = fh;
-    ncp->independent_fh = (ncp->nprocs > 1) ? MPI_FILE_NULL : fh;
-    ncp->path = (char*) NCI_Malloc(strlen(path) + 1);
-    strcpy(ncp->path, path);
+    /* set read-only mode */
+    if (!fIsSet(omode, NC_WRITE)) fSet(ncp->flags, NC_MODE_RDONLY);

 #ifdef PNETCDF_DEBUG
     /* PNETCDF_DEBUG is set at configure time, which will be overwritten by
-     * the run-time environment variable PNETCDF_SAFE_MODE */
+     * the run-time environment variable PNETCDF_SAFE_MODE.
+     */
     ncp->safe_mode = 1;
 #endif

     /* If environment variable PNETCDF_SAFE_MODE is set to 1, then we perform
@@ -133,17 +132,174 @@ ncmpio_open(MPI_Comm comm,
     if ((env_str = getenv("PNETCDF_SAFE_MODE")) != NULL) {
         if (*env_str == '0') ncp->safe_mode = 0;
         else                 ncp->safe_mode = 1;
-        /* if PNETCDF_SAFE_MODE is set but without a value, *env_str can
-         * be '\0' (null character). In this case, safe_mode is enabled */
+        /* If PNETCDF_SAFE_MODE is set but without a value, *env_str can
+         * be '\0' (null character). In this case, safe_mode is enabled.
+         */
+    }
+
+    /* Construct a list of unique IDs of compute nodes allocated to this job
+     * and save it in ncp->node_ids[nprocs], which contains node IDs of each
+     * rank. The node IDs are used either when intra-node aggregation is
+     * enabled or when using PnetCDF's PNCIO driver.
+     *
+     * When intra-node aggregation is enabled, node IDs are used to create a
+     * new MPI communicator consisting of the intra-node aggregators only. The
+     * communicator will be used to call file open in MPI-IO or PnetCDF's PNCIO
+     * driver. This means only intra-node aggregators will perform file I/O in
+     * PnetCDF collective put and get operations.
+     */
+    ncp->node_ids = NULL;
+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO || ncp->num_aggrs_per_node != 0) {
+        err = ncmpii_construct_node_list(comm, &ncp->num_nodes, &ncp->node_ids);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* When the total number of aggregators >= number of processes, disable
+         * intra-node aggregation.
+         */
+        if (ncp->num_aggrs_per_node * ncp->num_nodes >= ncp->nprocs)
+            ncp->num_aggrs_per_node = 0;
+    }
+
+    /* ncp->num_aggrs_per_node = 0, or > 0 indicates whether this feature
+     * is disabled or enabled globally for all processes.
+     */
+    ncp->my_aggr        = -1;
+    ncp->ina_comm       = MPI_COMM_NULL;
+    ncp->ina_nprocs     = 0;
+    ncp->ina_rank       = -1;
+    ncp->ina_node_list  = NULL;
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Divide all ranks into groups. Each group is assigned one intra-node
+         * aggregator. The following metadata related to intra-node
+         * aggregation will be set up.
+         * ncp->my_aggr is the aggregator's rank ID of this group. When ==
+         * ncp->rank, this rank is an aggregator.
+         * ncp->num_nonaggrs is the number of non-aggregators assigned to this
+         * rank (an aggregator).
+         * ncp->ina_comm will be created consisting of only intra-node
+         * aggregators, which will be used when calling MPI_File_open().
+         * For non-aggregators, ncp->ina_comm == MPI_COMM_NULL.
+         * ncp->node_ids[] will be modified to contain the node IDs of
+         * intra-node aggregators only, which will be passed to pncio_fh.
+         */
+        err = ncmpio_ina_init(ncp);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* As non-aggregators will not perform any file I/O, we can now replace
+         * comm with ina_comm. Same for nprocs.
+         */
+        comm = ncp->ina_comm;
+        nprocs = ncp->ina_nprocs;
+
+        /* For non-aggregators, comm is MPI_COMM_NULL. As the remaining task of
+         * this subroutine is to open the file and obtain the file handle,
+         * non-aggregators can skip it.
+         */
+        if (comm == MPI_COMM_NULL) {
+            MPI_Info_create(&ncp->mpiinfo);
+            goto fn_exit;
+        }
+    }
+
+    /* open file collectively ---------------------------------------------- */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO) {
+        TRACE_IO(MPI_File_open, (comm, path, mpiomode, user_info, &fh));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+
+        /* Now the file has been successfully opened */
+        ncp->collective_fh  = fh;
+        ncp->independent_fh = (nprocs > 1) ? MPI_FILE_NULL : fh;
+
+        /* get the I/O hints used/modified by MPI-IO */
+        TRACE_IO(MPI_File_get_info, (fh, &ncp->mpiinfo));
+        if (mpireturn != MPI_SUCCESS) {
+            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+            DEBUG_FOPEN_ERROR(err);
+        }
+    }
+    else {
+        /* When ncp->fstype != PNCIO_FSTYPE_MPIIO, use PnetCDF's PNCIO driver */
+        ncp->pncio_fh = (PNCIO_File*) NCI_Calloc(1,sizeof(PNCIO_File));
+        ncp->pncio_fh->file_system = ncp->fstype;
+        ncp->pncio_fh->num_nodes   = ncp->num_nodes;
+        ncp->pncio_fh->node_ids    = ncp->node_ids;
+
+        err = PNCIO_File_open(comm, filename, mpiomode, user_info,
+                              ncp->pncio_fh);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+
+        /* Now the file has been successfully opened, obtain the I/O hints
+         * used/modified by the PNCIO driver.
+         */
+        err = PNCIO_File_get_info(ncp->pncio_fh, &ncp->mpiinfo);
+        if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
+    }
+
+    /* add PnetCDF hints into ncp->mpiinfo */
+    ncmpio_hint_set(ncp, ncp->mpiinfo);
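To make the grouping described above concrete, here is a hedged, self-contained sketch of how one aggregator per node could be chosen from the node-ID list. This is not the actual ncmpio_ina_init(), which supports multiple aggregators per node; the helper below assumes one aggregator per node:

/* Illustrative only: pick the lowest rank on each node as its aggregator.
 * node_ids[r] is the node ID of rank r, as built by the node-list step. */
static int pick_my_aggr(int rank, int nprocs, const int *node_ids)
{
    int r, my_aggr = rank;
    for (r = 0; r < nprocs; r++) {
        if (node_ids[r] == node_ids[rank]) { /* first rank on my node */
            my_aggr = r;
            break;
        }
    }
    return my_aggr;
}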
+
+fn_exit:
+    if (ncp->num_aggrs_per_node > 0) {
+        /* When intra-node aggregation is enabled, it is necessary to make sure
+         * non-aggregators obtain consistent values of the file striping hints,
+         * as non-aggregators do not have hints returned from
+         * MPI_File_get_info().
+         */
+        int striping_info[2];
+        if (ncp->rank == 0) {
+            MPI_Info_get(ncp->mpiinfo, "striping_unit", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[0] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[0] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[0] = 0;
+            }
+
+            MPI_Info_get(ncp->mpiinfo, "striping_factor", MPI_MAX_INFO_VAL-1,
+                         value, &flag);
+            striping_info[1] = 0;
+            if (flag) {
+                errno = 0; /* errno must be set to zero before calling strtol */
+                striping_info[1] = (int)strtol(value,NULL,10);
+                if (errno != 0) striping_info[1] = 0;
+            }
+        }
+
+        MPI_Bcast(striping_info, 2, MPI_INT, 0, ncp->comm);
+
+        if (ncp->my_aggr != ncp->rank) {
+            sprintf(value, "%d", striping_info[0]);
+            MPI_Info_set(ncp->mpiinfo, "striping_unit", value);
+            sprintf(value, "%d", striping_info[1]);
+            MPI_Info_set(ncp->mpiinfo, "striping_factor", value);
+        }
+    }
+
+    /* ina_node_list is no longer needed */
+    if (ncp->ina_node_list != NULL) {
+        NCI_Free(ncp->ina_node_list);
+        ncp->ina_node_list = NULL;
+    }
+    /* node_ids is no longer needed */
+    if (ncp->node_ids != NULL) {
+        NCI_Free(ncp->node_ids);
+        ncp->node_ids = NULL;
+    }
+    if (ncp->pncio_fh != NULL)
+        ncp->pncio_fh->node_ids = NULL;

     /* read header from file into NC object pointed by ncp -------------------*/
     err = ncmpio_hdr_get_NC(ncp);
     if (err == NC_ENULLPAD) status = NC_ENULLPAD; /* non-fatal error */
     else if (err != NC_NOERR) { /* fatal error */
-        ncmpio_close_files(ncp, 0);
+        ncmpio_file_close(ncp);
+        if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm);
         ncmpio_free_NC(ncp);
-        return err;
+        DEBUG_RETURN_ERROR(err);
     }

 #ifdef ENABLE_SUBFILING
@@ -152,29 +308,28 @@ ncmpio_open(MPI_Comm comm,
         err = ncmpio_get_att(ncp, NC_GLOBAL, "_PnetCDF_SubFiling.num_subfiles",
                              &ncp->num_subfiles, MPI_INT);
         if (err == NC_NOERR && ncp->num_subfiles > 1) {
-            int i;
             /* ignore error NC_ENOTATT if this attribute is not defined */
             for (i=0; i<ncp->vars.ndefined; i++) {
                 /* variables may have different numbers of subfiles */
                 err = ncmpio_get_att(ncp, i, "_PnetCDF_SubFiling.num_subfiles",
                                      &ncp->vars.value[i]->num_subfiles,MPI_INT);
                 if (err == NC_ENOTATT) continue;
-                if (err != NC_NOERR) return err;
+                if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                 if (ncp->vars.value[i]->num_subfiles > 1) {
                     /* find the orginal ndims of variable i */
                     err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.ndims_org",
                                          &ncp->vars.value[i]->ndims_org,MPI_INT);
-                    if (err != NC_NOERR) return err;
+                    if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                     ncp->vars.value[i]->dimids_org = (int*) NCI_Malloc(
                                   ncp->vars.value[i]->ndims_org * SIZEOF_INT);
                     err = ncmpio_get_att(ncp,i,"_PnetCDF_SubFiling.dimids_org",
                                          ncp->vars.value[i]->dimids_org, MPI_INT);
-                    if (err != NC_NOERR) return err;
+                    if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
                 }
             }
             /* open subfile */
             err = ncmpio_subfile_open(ncp);
-            if (err != NC_NOERR) return err;
+            if (err != NC_NOERR) DEBUG_FOPEN_ERROR(err);
         }
         else ncp->num_subfiles = 0;
     }
@@ -191,21 +346,6 @@ ncmpio_open(MPI_Comm comm,
             ncp->vars.value[i]->attrs.hash_size = ncp->hash_size_attr;
 #endif

-    /* determine whether to enable intra-node aggregation and set up all
-     * intra-node aggregation metadata.
-     * ncp->num_aggrs_per_node = 0, or non-zero indicates whether this feature
-     * is enabled globally for all processes.
-     * ncp->my_aggr = -1 or >= 0 indicates whether aggregation is effectively
-     * enabled for the aggregation group of this process.
-     */
-    ncp->my_aggr = -1;
-    if (ncp->num_aggrs_per_node != 0) {
-        err = ncmpio_intra_node_aggr_init(ncp);
-        if (err != NC_NOERR) return err;
-    }
-
-    *ncpp = (void*)ncp;
-
     return status;
 }

diff --git a/src/drivers/ncmpio/ncmpio_subfile.c b/src/drivers/ncmpio/ncmpio_subfile.c
index e1be70ec7..3bfb0bef6 100644
--- a/src/drivers/ncmpio/ncmpio_subfile.c
+++ b/src/drivers/ncmpio/ncmpio_subfile.c
@@ -315,7 +315,7 @@ int ncmpio_subfile_partition(NC *ncp)
         if (dpp[vpp[i]->dimids[par_dim_id]]->size/ncp->num_subfiles > 0 &&
             vpp[i]->ndims >= par_dim_id+1 &&
             vpp[i]->ndims >= SUBFILING_MIN_NDIMS) {
-            int varid, j, jj, k;
+            int varid, jj, k;
             int var_ndims = vpp[i]->ndims; /* keep org ndims */
             int dimids[var_ndims];
             char *key[ncp->num_subfiles][var_ndims];
@@ -1003,7 +1003,6 @@ ncmpio_subfile_getput_vars(NC *ncp,
     for (i=0; i<

diff --git a/src/drivers/ncmpio/ncmpio_sync.c b/src/drivers/ncmpio/ncmpio_sync.c
--- a/src/drivers/ncmpio/ncmpio_sync.c
+++ b/src/drivers/ncmpio/ncmpio_sync.c
 #include "ncmpio_NC.h"

-/*----< ncmpio_file_sync() >-------------------------------------------------*/
-/* This function must be called collectively, no matter if it is in collective
- * or independent data mode.
- */
-int
-ncmpio_file_sync(NC *ncp) {
-    char *mpi_name;
-    int mpireturn;
-
-    if (ncp->independent_fh != MPI_FILE_NULL) {
-        TRACE_IO(MPI_File_sync, (ncp->independent_fh));
-        if (mpireturn != MPI_SUCCESS)
-            return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-    }
-    /* when nprocs == 1, ncp->collective_fh == ncp->independent_fh */
-    if (ncp->nprocs == 1) return NC_NOERR;
-
-    /* ncp->collective_fh is never MPI_FILE_NULL as collective mode is
-     * default in PnetCDF */
-    TRACE_IO(MPI_File_sync, (ncp->collective_fh));
-    if (mpireturn != MPI_SUCCESS)
-        return ncmpii_error_mpi2nc(mpireturn, mpi_name);
-
-    /* Barrier is not necessary ...
-    TRACE_COMM(MPI_Barrier)(ncp->comm);
-     */
-
-    return NC_NOERR;
-}
-
 #define NC_NUMRECS_OFFSET 4

 /*----< ncmpio_write_numrecs() >---------------------------------------------*/
-/* root process writes the new record number into file.
+/* Only the root process writes the new record number into the file.
  * This function is called by:
  * 1. ncmpio_sync_numrecs
  * 2. collective nonblocking wait API, if the new number of records is bigger
@@ -69,32 +39,42 @@ int
 ncmpio_write_numrecs(NC *ncp,
                      MPI_Offset new_numrecs)
 {
-    char *mpi_name;
-    int mpireturn, err;
-    MPI_File fh;
-    MPI_Status mpistatus;
+    int err=NC_NOERR;
+    PNCIO_View buf_view;

-    if (!fIsSet(ncp->flags, NC_HCOLL) && ncp->rank > 0)
-        /* Only root process writes numrecs in file */
-        return NC_NOERR;
+    buf_view.type      = MPI_BYTE;
+    buf_view.size      = 0;
+    buf_view.count     = 1;
+    buf_view.is_contig = 1;

-    /* return now if there is no record variabled defined */
+    /* return now if there is no record variable defined */
     if (ncp->vars.num_rec_vars == 0) return NC_NOERR;

-    fh = ncp->independent_fh;
-    if (ncp->nprocs > 1 && !NC_indep(ncp))
-        fh = ncp->collective_fh;
+    /* When intra-node aggregation is enabled, non-aggregators do not
+     * participate in any collective calls below.
+     */
+    if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr)
+        return NC_NOERR;
+
+    /* If not requiring all MPI-IO calls to be collective, non-root processes
+     * can return now. This is because only the root process writes numrecs to
+     * the file header.
+     */
+    if (!fIsSet(ncp->flags, NC_HCOLL) && ncp->rank > 0)
+        return NC_NOERR;

+    /* If collective MPI-IO is required for all MPI-IO calls, then all non-root
+     * processes participate in the collective write call with zero-size
+     * requests.
+     */
     if (ncp->rank > 0 && fIsSet(ncp->flags, NC_HCOLL)) {
-        /* other processes participate the collective call */
-        TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        return (mpireturn == MPI_SUCCESS) ? NC_NOERR :
-               ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        ncmpio_file_write_at_all(ncp, 0, NULL, buf_view);
+        return NC_NOERR;
     }
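For context, numrecs lives at byte offset 4 of the classic-format header, immediately after the 4-byte magic string, which is why NC_NUMRECS_OFFSET is 4. A hedged sketch of the big-endian encoding the root performs (the real code uses the ncmpix_put_* helpers; CDF-1/2 store 4 bytes, CDF-5 stores 8):

/* Illustrative only: big-endian encode numrecs into pos[], most
 * significant byte first, returning the number of bytes written. */
static int encode_numrecs(char *pos, long long numrecs, int is_cdf5)
{
    int i, len = is_cdf5 ? 8 : 4;
    for (i = 0; i < len; i++)
        pos[i] = (char)((unsigned long long)numrecs >> (8 * (len - 1 - i)));
    return len;
}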
     if (new_numrecs > ncp->numrecs || NC_ndirty(ncp)) {
         int len;
         char pos[8], *buf=pos;
+        MPI_Offset wlen;

         /* update ncp->numrecs */
         if (new_numrecs > ncp->numrecs) ncp->numrecs = new_numrecs;

@@ -113,41 +93,32 @@ ncmpio_write_numrecs(NC *ncp,
         }
         /* ncmpix_put_xxx advances the 1st argument with size len */

-        /* explicitly initialize mpistatus object to 0. For zero-length read,
-         * MPI_Get_count may report incorrect result for some MPICH version,
-         * due to the uninitialized MPI_Status object passed to MPI-IO calls.
-         * Thus we initialize it above to work around.
-         */
-        memset(&mpistatus, 0, sizeof(MPI_Status));
-
-        /* root's file view always includes the entire file header */
-        if (fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1) {
-            TRACE_IO(MPI_File_write_at_all, (fh, NC_NUMRECS_OFFSET, (void*)pos,
-                                             len, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_write_at, (fh, NC_NUMRECS_OFFSET, (void*)pos,
-                                         len, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            if (err == NC_EFILE) DEBUG_RETURN_ERROR(NC_EWRITE)
-        }
-        else {
-            /* update the number of bytes written since file open.
-             * Because the above MPI write writes either 4 or 8 bytes,
-             * calling MPI_Get_count() is sufficient. No need to call
-             * MPI_Get_count_c()
+        if (ncp->num_aggrs_per_node > 0 && ncp->rank != ncp->my_aggr)
+            /* When intra-node aggregation is enabled, non-aggregators do not
+             * participate in the collective call.
              */
-            int put_size;
-            mpireturn = MPI_Get_count(&mpistatus, MPI_BYTE, &put_size);
-            if (mpireturn != MPI_SUCCESS || put_size == MPI_UNDEFINED)
-                ncp->put_size += len;
-            else
-                ncp->put_size += put_size;
+            return NC_NOERR;
+
+        if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+            /* reset fileview */
+            err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+            if (err != NC_NOERR) DEBUG_RETURN_ERROR(err)
         }
+
+        buf_view.size = len;
+
+        /* root's file view always includes the entire file header */
+        if (fIsSet(ncp->flags, NC_HCOLL) && ncp->nprocs > 1)
+            wlen = ncmpio_file_write_at_all(ncp, NC_NUMRECS_OFFSET, (void*)pos,
+                                            buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, NC_NUMRECS_OFFSET, (void*)pos,
+                                        buf_view);
+        if (wlen < 0)
+            DEBUG_RETURN_ERROR((int)wlen)
     }
-    return NC_NOERR;
+    return err;
 }

 /*----< ncmpio_sync_numrecs() >-----------------------------------------------*/

diff --git a/src/drivers/ncmpio/ncmpio_util.c b/src/drivers/ncmpio/ncmpio_util.c
index 8034f9f0b..0223977ce 100644
--- a/src/drivers/ncmpio/ncmpio_util.c
+++ b/src/drivers/ncmpio/ncmpio_util.c
@@ -18,267 +18,320 @@
 #include <string.h>
 #include <strings.h>
+#include <assert.h>
 #include "ncmpio_NC.h"

-/*----< ncmpio_set_pnetcdf_hints() >-----------------------------------------*/
-/* this is where the I/O hints designated to pnetcdf are extracted and their
- * default values are set.
- */
+#define MAX_INT_LEN 24
+
+/*----< ncmpio_hint_extract() >----------------------------------------------*/
+/* Extract hints from info. Argument info is the info object set by the
+ * application user and passed to ncmpi_create() or ncmpi_open(). For those
+ * PnetCDF hints that are not set in info, their default values are used.
+ */
-void ncmpio_set_pnetcdf_hints(NC *ncp,
-                              MPI_Info user_info,
-                              MPI_Info info_used)
+void ncmpio_hint_extract(NC *ncp,
+                         MPI_Info info)
 {
     char value[MPI_MAX_INFO_VAL];
-    int flag;
+    int flag, ival;
+    long long llval;

-    if (user_info == MPI_INFO_NULL) flag = 0;
+    assert(ncp != NULL);

-    /* Note info_used cannot be MPI_INFO_NULL, as it is returned from a call to
-     * MPI_File_get_info()
-     */
-    assert(info_used != MPI_INFO_NULL);
+    ncp->info_v_align = -1; /* -1 indicates not set */
+    ncp->info_r_align = -1; /* -1 indicates not set */
+
+    /* chunk size for reading header (set default before check hints) */
+    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
+
+    /* buffer to pack noncontiguous user buffers when calling wait() */
+    ncp->ibuf_size = PNC_DEFAULT_IBUF_SIZE;
+
+#ifdef ENABLE_SUBFILING
+    ncp->subfile_mode = 0;
+    ncp->num_subfiles = 0;
+#endif
+
+    ncp->dims.hash_size  = PNC_HSIZE_DIM;
+    ncp->vars.hash_size  = PNC_HSIZE_VAR;
+    ncp->attrs.hash_size = PNC_HSIZE_GATTR;
+    ncp->hash_size_attr  = PNC_HSIZE_VATTR;
+
+    /* number of INA aggregators per compute node */
+    ncp->num_aggrs_per_node = 0;
+
+    /* file system type */
+    ncp->fstype = PNCIO_FSTYPE_CHECK;
+
+    if (info == MPI_INFO_NULL) return;

     /* nc_var_align_size, and r_align take effect when a file is created, or
      * opened and later adding more metadata or variable data */

-    ncp->info_v_align = -1; /* -1 indicates not set */
-    if (user_info != MPI_INFO_NULL) {
-        /* aligns starting file offsets of entire data section */
-        MPI_Info_get(user_info, "nc_var_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ncp->info_v_align = strtoll(value, NULL, 10);
-            if (errno != 0) ncp->info_v_align = -1;
-            else if (ncp->info_v_align < 0) ncp->info_v_align = -1;
-        }
+    /* aligns starting file offsets of entire data section */
+    MPI_Info_get(info, "nc_var_align_size", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->info_v_align = llval;
     }
-    if (ncp->info_v_align == -1)
-        sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT);
-    else
-        sprintf(value, OFFFMT, ncp->info_v_align);
-    MPI_Info_set(info_used, "nc_var_align_size", value);

-    if (user_info != MPI_INFO_NULL) {
-        /* Hint nc_header_align_size is now deprecated. But for backward
-         * compatibility, let's still check.
-         */
-        MPI_Offset info_h_align = -1;
-        MPI_Info_get(user_info, "nc_header_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            info_h_align = strtoll(value, NULL, 10);
-            if (errno != 0) info_h_align = -1;
-            else if (info_h_align < 0) info_h_align = -1;
-        }
-        /* if nc_header_align_size is set and nc_var_align_size is not set,
-         * replace hint nc_var_align_size with the value of info_h_align.
-         */
-        if (info_h_align >= 0 && ncp->info_v_align == -1) {
-            ncp->info_v_align = info_h_align;
-            sprintf(value, OFFFMT, ncp->info_v_align);
-            MPI_Info_set(info_used, "nc_var_align_size", value);
+    /* Hint nc_header_align_size is now deprecated. But for backward
+     * compatibility, let's still check.
+     */
+    MPI_Info_get(info, "nc_header_align_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0) {
+            /* if nc_header_align_size is set and nc_var_align_size is not set,
+             * replace hint nc_var_align_size with the value of info_h_align.
+             */
+            if (llval >= 0 && ncp->info_v_align == -1)
+                ncp->info_v_align = llval;
         }
     }

-    ncp->info_r_align = -1;
-    if (user_info != MPI_INFO_NULL) {
-        /* aligns starting file offset of the record variable section */
-        MPI_Info_get(user_info, "nc_record_align_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ncp->info_r_align = strtoll(value, NULL, 10);
-            if (errno != 0) ncp->info_r_align = -1;
-            else if (ncp->info_r_align < 0) ncp->info_r_align = -1;
-        }
+    /* aligns starting file offset of the record variable section */
+    MPI_Info_get(info, "nc_record_align_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->info_r_align = llval;
     }
-    if (ncp->info_r_align == -1)
-        sprintf(value, "%d", FILE_ALIGNMENT_DEFAULT);
-    else
-        sprintf(value, OFFFMT, ncp->info_r_align);
-    MPI_Info_set(info_used, "nc_record_align_size", value);

-    ncp->chunk = PNC_DEFAULT_CHUNKSIZE;
-    if (user_info != MPI_INFO_NULL) {
-        /* header reading chunk size */
-        MPI_Info_get(user_info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            int chunk;
-            errno = 0; /* errno must set to zero before calling strtoll */
-            chunk = atoi(value);
-            if (errno != 0) ncp->chunk = 0;
-            else if (ncp->chunk < 0)
+    /* header reading chunk size */
+    MPI_Info_get(info, "nc_header_read_chunk_size", MPI_MAX_INFO_VAL-1,
+                 value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0) {
+            if (llval < 0)
                 ncp->chunk = 0;
-            else if (chunk > NC_MAX_INT) /* limit to NC_MAX_INT */
+            else if (llval > NC_MAX_INT) /* limit to NC_MAX_INT */
                 ncp->chunk = NC_MAX_INT;
+            else
+                ncp->chunk = (int)llval;
+
+            /* CDF-5's minimum header size is 4 bytes more than CDF-1/2's */
+            ncp->chunk = PNETCDF_RNDUP(MAX(MIN_NC_XSZ+4, ncp->chunk), X_ALIGN);
         }
     }
-    sprintf(value, "%d", ncp->chunk);
-    MPI_Info_set(info_used, "nc_header_read_chunk_size", value);
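The errno-plus-strtoll validation pattern recurs for every numeric hint in this function. A hedged sketch of the idiom as a standalone helper, not part of the patch itself:

#include <errno.h>
#include <stdlib.h>

/* Illustrative helper: parse a non-negative integer hint value.
 * Returns -1 when the string is not a valid non-negative number,
 * mirroring the "-1 indicates not set" convention used above. */
static long long parse_hint_ll(const char *value)
{
    char *endp;
    long long llval;

    errno = 0;  /* must be cleared before calling strtoll */
    llval = strtoll(value, &endp, 10);
    if (errno != 0 || endp == value || llval < 0)
        return -1;
    return llval;
}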
-    strcpy(value, "auto");
-    if (user_info != MPI_INFO_NULL) {
-        /* setting in-place byte swap (matters only for Little Endian) */
-        MPI_Info_get(user_info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "enable") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_OFF);
-                fSet(ncp->flags, NC_MODE_SWAP_ON);
-            }
-            else if (strcasecmp(value, "disable") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_ON);
-                fSet(ncp->flags, NC_MODE_SWAP_OFF);
-            }
-            else if (strcasecmp(value, "auto") == 0) {
-                fClr(ncp->flags, NC_MODE_SWAP_ON);
-                fClr(ncp->flags, NC_MODE_SWAP_OFF);
-            }
+    /* setting in-place byte swap (matters only for Little Endian) */
+    MPI_Info_get(info, "nc_in_place_swap", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        if (strcasecmp(value, "enable") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_OFF);
+            fSet(ncp->flags, NC_MODE_SWAP_ON);
+        }
+        else if (strcasecmp(value, "disable") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_ON);
+            fSet(ncp->flags, NC_MODE_SWAP_OFF);
+        }
+        else if (strcasecmp(value, "auto") == 0) {
+            fClr(ncp->flags, NC_MODE_SWAP_ON);
+            fClr(ncp->flags, NC_MODE_SWAP_OFF);
         }
     }
-    MPI_Info_set(info_used, "nc_in_place_swap", value);

-    if (user_info != MPI_INFO_NULL) {
-        /* temporal buffer size used to pack noncontiguous aggregated user
-         * buffers when calling ncmpi_wait/wait_all, Default 16 MiB
-         */
-        MPI_Info_get(user_info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value,
-                     &flag);
-        if (flag) {
-            MPI_Offset ibuf_size;
-            errno = 0; /* errno must set to zero before calling strtoll */
-            ibuf_size = strtoll(value, NULL, 10);
-            if (errno == 0 && ibuf_size >= 0) ncp->ibuf_size = ibuf_size;
-        }
+    /* Temporary buffer size used to pack non-contiguous aggregated user
+     * buffers when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE.
+     */
+    MPI_Info_get(info, "nc_ibuf_size", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling strtoll */
+        llval = strtoll(value, NULL, 10);
+        if (errno == 0 && llval >= 0)
+            ncp->ibuf_size = llval;
     }
-    sprintf(value, OFFFMT, ncp->ibuf_size);
-    MPI_Info_set(info_used, "nc_ibuf_size", value);

 #ifdef ENABLE_SUBFILING
-    ncp->subfile_mode = 0;
-    if (user_info != MPI_INFO_NULL) {
-        MPI_Info_get(user_info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "enable") == 0)
-                ncp->subfile_mode = 1;
+    MPI_Info_get(info, "pnetcdf_subfiling", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        if (strcasecmp(value, "enable") == 0)
+            ncp->subfile_mode = 1;
+        else {
+            ncp->subfile_mode = 0;
+            ncp->num_subfiles = 0;
         }
     }
-    if (ncp->subfile_mode)
-        MPI_Info_set(info_used, "pnetcdf_subfiling", "enable");
-    else
-        MPI_Info_set(info_used, "pnetcdf_subfiling", "disable");

-    ncp->num_subfiles = 0;
-    if (user_info != MPI_INFO_NULL) {
-        MPI_Info_get(user_info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
+    if (ncp->subfile_mode == 1) {
+        MPI_Info_get(info, "nc_num_subfiles", MPI_MAX_INFO_VAL-1, value, &flag);
         if (flag) {
-            errno = 0;
-            ncp->num_subfiles = atoi(value);
-            if (errno != 0) ncp->num_subfiles = 0;
-            else if (ncp->num_subfiles < 0) ncp->num_subfiles = 0;
+            errno = 0; /* errno must be set to zero before calling atoi */
+            ival = atoi(value);
+            if (errno == 0 && ival >= 0)
+                ncp->num_subfiles = ival;
         }
     }
-    sprintf(value, "%d", ncp->num_subfiles);
-    MPI_Info_set(info_used, "nc_num_subfiles", value);
-
-    if (ncp->subfile_mode == 0) ncp->num_subfiles = 0;
-#else
-    MPI_Info_set(info_used, "pnetcdf_subfiling", "disable");
-    MPI_Info_set(info_used, "nc_num_subfiles", "0");
 #endif

-    if (user_info != MPI_INFO_NULL) {
-        /* If romio_no_indep_rw is set to true, let all processes participate
-         * the read/write file header using MPI collective APIs, where only
-         * rank 0 has non-zero request count.
-         */
-        MPI_Info_get(user_info, "romio_no_indep_rw", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            if (strcasecmp(value, "true") == 0)
-                fSet((ncp)->flags, NC_HCOLL);
-        }
+    /* Hash table size for dimensions */
+    MPI_Info_get(info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->dims.hash_size = ival;
     }

-    ncp->dims.hash_size = PNC_HSIZE_DIM;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for dimensions */
-        MPI_Info_get(user_info, "nc_hash_size_dim", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->dims.hash_size = atoi(value);
-            if (errno != 0 || ncp->dims.hash_size < 0)
-                ncp->dims.hash_size = PNC_HSIZE_DIM;
-        }
+    /* Hash table size for variables */
+    MPI_Info_get(info, "nc_hash_size_var", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->vars.hash_size = ival;
     }
-    sprintf(value, "%d", ncp->dims.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_dim", value);

-    ncp->vars.hash_size = PNC_HSIZE_VAR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for variables */
-        MPI_Info_get(user_info, "nc_hash_size_var", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->vars.hash_size = atoi(value);
-            if (errno != 0 || ncp->vars.hash_size < 0)
-                ncp->vars.hash_size = PNC_HSIZE_VAR;
-        }
+    /* Hash table size for global attributes */
+    MPI_Info_get(info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->attrs.hash_size = ival;
     }
-    sprintf(value, "%d", ncp->vars.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_var", value);

-    ncp->attrs.hash_size = PNC_HSIZE_GATTR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for global attributes */
-        MPI_Info_get(user_info, "nc_hash_size_gattr", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->attrs.hash_size = atoi(value);
-            if (errno != 0 || ncp->attrs.hash_size < 0)
-                ncp->attrs.hash_size = PNC_HSIZE_GATTR;
-        }
+    /* Hash table size for non-global attributes */
+    MPI_Info_get(info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag) {
+        errno = 0; /* errno must be set to zero before calling atoi */
+        ival = atoi(value);
+        if (errno == 0 && ival >= 0)
+            ncp->hash_size_attr = ival;
     }
-    sprintf(value, "%d", ncp->attrs.hash_size);
-    MPI_Info_set(info_used, "nc_hash_size_gattr", value);

-    ncp->hash_size_attr = PNC_HSIZE_VATTR;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for non-global attributes */
-        MPI_Info_get(user_info, "nc_hash_size_vattr", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
+    /* Number of intra-node aggregators per compute node.
+     */
+    if (ncp->nprocs > 1) {
+        MPI_Info_get(info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1, value,
+                     &flag);
         if (flag) {
             errno = 0; /* errno must set to zero before calling atoi */
-            ncp->hash_size_attr = atoi(value);
-            if (errno != 0 || ncp->hash_size_attr < 0)
-                ncp->hash_size_attr = PNC_HSIZE_VATTR;
+            ival = atoi(value);
+            if (errno == 0 && ival >= 0)
+                ncp->num_aggrs_per_node = ival;
         }
     }
-    sprintf(value, "%d", ncp->hash_size_attr);
-    MPI_Info_set(info_used, "nc_hash_size_vattr", value);

-    ncp->num_aggrs_per_node = 0;
-    if (user_info != MPI_INFO_NULL) {
-        /* Hash table size for non-global attributes */
-        MPI_Info_get(user_info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1,
-                     value, &flag);
-        if (flag) {
-            errno = 0; /* errno must set to zero before calling atoi */
-            ncp->num_aggrs_per_node = atoi(value);
-            if (errno != 0 || ncp->num_aggrs_per_node < 0)
-                ncp->num_aggrs_per_node = 0;
+    /* If the user explicitly wants to use MPI-IO instead of PnetCDF's internal
+     * PNCIO driver, then set the PnetCDF I/O hint "nc_pncio" to "disable".
+     */
+    MPI_Info_get(info, "nc_pncio", MPI_MAX_INFO_VAL-1, value, &flag);
+    if (flag && strcasecmp(value, "disable") == 0)
+        ncp->fstype = PNCIO_FSTYPE_MPIIO;
+}
+
+/*----< ncmpio_hint_set() >--------------------------------------------------*/
+/* Insert PnetCDF hints into info. Argument info is the info object returned
+ * from an earlier call to MPI_File_get_info().
+ */
+void ncmpio_hint_set(NC *ncp,
+                     MPI_Info info)
+{
+    char int_str[MAX_INT_LEN];
+
+    assert(ncp != NULL);
+    assert(info != MPI_INFO_NULL);
+
+    /* nc_var_align_size, and r_align take effect when a file is created, or
+     * opened and later adding more metadata or variable data
+     */
+
+    /* aligns starting file offsets of entire data section */
+    if (ncp->info_v_align != -1) {
+        snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_v_align);
+        MPI_Info_set(info, "nc_var_align_size", int_str);
+    }
+
+    /* aligns starting file offset of the record variable section */
+    if (ncp->info_r_align != -1) {
+        snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->info_r_align);
+        MPI_Info_set(info, "nc_record_align_size", int_str);
+    }
+
+    /* header reading chunk size */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->chunk);
+    MPI_Info_set(info, "nc_header_read_chunk_size", int_str);
+
+    /* setting in-place byte swap (matters only for Little Endian) */
+    int swap_on  = fIsSet(ncp->flags, NC_MODE_SWAP_ON);
+    int swap_off = fIsSet(ncp->flags, NC_MODE_SWAP_OFF);
+    if (!swap_on && !swap_off)
+        MPI_Info_set(info, "nc_in_place_swap", "auto");
+    else if (swap_on)
+        MPI_Info_set(info, "nc_in_place_swap", "enable");
+    else
+        MPI_Info_set(info, "nc_in_place_swap", "disable");
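Because ncmpio_hint_set() populates the info object later handed back to applications, the effective values can be verified from user code. A hedged sketch using the standard inquiry API; ncid is assumed to come from a successful ncmpi_open or ncmpi_create:

#include <stdio.h>
#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: query the effective PnetCDF hints of an open file. */
void print_pncio_hints(int ncid)
{
    char value[MPI_MAX_INFO_VAL];
    int flag;
    MPI_Info info;

    ncmpi_inq_file_info(ncid, &info);  /* returns a copy; caller frees */
    MPI_Info_get(info, "nc_pncio", MPI_MAX_INFO_VAL-1, value, &flag);
    if (flag) printf("nc_pncio = %s\n", value);
    MPI_Info_get(info, "nc_num_aggrs_per_node", MPI_MAX_INFO_VAL-1,
                 value, &flag);
    if (flag) printf("nc_num_aggrs_per_node = %s\n", value);
    MPI_Info_free(&info);
}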
+
+    /* Temporary buffer size used to pack non-contiguous aggregated user
+     * buffers when calling ncmpi_wait/wait_all. Default PNC_DEFAULT_IBUF_SIZE.
+     */
+    snprintf(int_str, MAX_INT_LEN, OFFFMT, ncp->ibuf_size);
+    MPI_Info_set(info, "nc_ibuf_size", int_str);
+
+#ifdef ENABLE_SUBFILING
+    if (ncp->subfile_mode)
+        MPI_Info_set(info, "pnetcdf_subfiling", "enable");
+    else
+        MPI_Info_set(info, "pnetcdf_subfiling", "disable");
+
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_subfiles);
+    MPI_Info_set(info, "nc_num_subfiles", int_str);
+#endif
+
+    /* Hash table size for dimensions */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->dims.hash_size);
+    MPI_Info_set(info, "nc_hash_size_dim", int_str);
+
+    /* Hash table size for variables */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->vars.hash_size);
+    MPI_Info_set(info, "nc_hash_size_var", int_str);
+
+    /* Hash table size for global attributes */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->attrs.hash_size);
+    MPI_Info_set(info, "nc_hash_size_gattr", int_str);
+
+    /* Hash table size for non-global attributes */
+    snprintf(int_str, MAX_INT_LEN, "%d", ncp->hash_size_attr);
+    MPI_Info_set(info, "nc_hash_size_vattr", int_str);
+
+    /* Whether using MPI-IO instead of PnetCDF's internal PNCIO driver. */
+    if (ncp->fstype == PNCIO_FSTYPE_MPIIO)
+        MPI_Info_set(info, "nc_pncio", "disable");
+    else
+        MPI_Info_set(info, "nc_pncio", "enable");
+
+    if (ncp->num_aggrs_per_node > 0) {
+        /* Number of intra-node aggregators per compute node. */
+        snprintf(int_str, MAX_INT_LEN, "%d", ncp->num_aggrs_per_node);
+        MPI_Info_set(info, "nc_num_aggrs_per_node", int_str);
+
+        /* Add hint "nc_ina_node_list", the list of INA aggregators' rank IDs */
+        if (ncp->ina_node_list != NULL) {
+            char value[MPI_MAX_INFO_VAL];
+            int i;
+            snprintf(value, MAX_INT_LEN, "%d", ncp->ina_node_list[0]);
+            for (i=1; i<ncp->ina_nprocs; i++) {
+                snprintf(int_str, sizeof(int_str), " %d", ncp->ina_node_list[i]);
+                if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) {
+                    strcat(value, " ...");
+                    break;
+                }
+                strcat(value, int_str);
+            }
+            MPI_Info_set(info, "nc_ina_node_list", value);
+        }
+    }
-    sprintf(value, "%d", ncp->num_aggrs_per_node);
-    MPI_Info_set(info_used, "nc_num_aggrs_per_node", value);
+    else /* Update hint "nc_num_aggrs_per_node" to indicate disabled. */
+        MPI_Info_set(info, "nc_num_aggrs_per_node", "0");
 }

 /*----< ncmpio_first_offset() >-----------------------------------------------*/
@@ -730,12 +783,12 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
             break;
         }
         /* The only error codes returned from the above switch block are
-        * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity
-        * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus,
-        * the only possible error is NC_ERANGE. NC_ERANGE can be caused by
-        * one or more elements of buf that is out of range representable by
-        * the external data type, it is not considered a fatal error. This
-        * request must continue to finish.
+         * NC_EBADTYPE or NC_ERANGE. Bad varp->xtype and itype have been sanity
+         * checked at the dispatchers, so NC_EBADTYPE is not possible. Thus,
+         * the only possible error is NC_ERANGE. NC_ERANGE can be caused by
+         * one or more elements of buf that is out of range representable by
+         * the external data type, it is not considered a fatal error. This
+         * request must continue to finish.
         */
     }
     else {
@@ -785,30 +838,36 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
         MPI_Type_free(&imaptype);
     }

-    /* unpacked lbuf into buf based on buftype -----------------------------*/
-    if (!buftype_is_contig && lbuf != buf) {
-        /* no need unpack when buftype is used in MPI_File_read (lbuf == buf) */
+    /* Unpack lbuf into buf based on buftype. Note there is no need to unpack
+     * when buftype is used in MPI_File_read, i.e. lbuf == buf.
+     */
+    if (lbuf != buf) {
+        if (buftype_is_contig)
+            memcpy(buf, lbuf, ibuf_size);
+        else { /* buftype is not contiguous */
 #ifdef HAVE_MPI_LARGE_COUNT
-        MPI_Count position = 0;
-        mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position, buf,
-                                 (MPI_Count)bufcount, buftype, MPI_COMM_SELF);
-        if (mpireturn != MPI_SUCCESS)
-            return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c");
-#else
-        if (bufcount > NC_MAX_INT) {
-            if (err == NC_NOERR)
-                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
-        }
-        else {
-            int position = 0;
-            if (ibuf_size > NC_MAX_INT)
-                DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
-            mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf,
-                                   (int)bufcount, buftype, MPI_COMM_SELF);
+            MPI_Count position = 0;
+            mpireturn = MPI_Unpack_c(lbuf, (MPI_Count)ibuf_size, &position,
+                                     buf, (MPI_Count)bufcount, buftype,
+                                     MPI_COMM_SELF);
             if (mpireturn != MPI_SUCCESS)
-                return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack");
-        }
+                return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack_c");
+#else
+            if (bufcount > NC_MAX_INT) {
+                if (err == NC_NOERR)
+                    DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+            }
+            else {
+                int position = 0;
+                if (ibuf_size > NC_MAX_INT)
+                    DEBUG_RETURN_ERROR(NC_EINTOVERFLOW)
+                mpireturn = MPI_Unpack(lbuf, (int)ibuf_size, &position, buf,
+                                       (int)bufcount, buftype, MPI_COMM_SELF);
+                if (mpireturn != MPI_SUCCESS)
+                    return ncmpii_error_mpi2nc(mpireturn, "MPI_Unpack");
+            }
 #endif
+        }
     }
     if (free_cbuf) NCI_Free(cbuf);
     if (free_lbuf) NCI_Free(lbuf);
@@ -816,3 +875,136 @@ ncmpio_unpack_xbuf(int fmt, /* NC_FORMAT_CDF2 NC_FORMAT_CDF5 etc. */
     return err;
 }
+
+/*----< ncmpio_calc_off() >--------------------------------------------------*/
+/* Returns the starting file offset of a subarray request.
+ * Note a zero-length request should never call this subroutine; it is called
+ * only when the request flattens to a single offset-length pair.
+ */
+int
+ncmpio_calc_off(const NC         *ncp,
+                const NC_var     *varp,
+                const MPI_Offset *start,  /* [varp->ndims] */
+                MPI_Offset       *offset) /* OUT: start offset */
+{
+    int i, ndims = varp->ndims; /* number of dimensions of this variable */
+
+    /*
+     * varp->dsizes[] is computed from right to left product of shape
+     * For example, a 3D array of size 5x4x3 in C order,
+     * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3
+     * For record variable:     dsizes[0]=12 dsizes[1]=12 dsizes[2]=3
+     */
+    if (IS_RECVAR(varp)) {
+        *offset = 0;
+        if (ndims > 1) {
+            /* start from the least significant dimension */
+            *offset = start[ndims-1];
+            /* the remaining dimensions */
+            for (i=ndims-2; i>0; i--)
+                *offset += start[i]*varp->dsizes[i+1];
+        }
+        *offset *= varp->xsz; /* offset in bytes */
+    }
+    else {
+        /* first handle the least significant dimension */
+        *offset = start[ndims-1];
+        /* remaining dimensions till the most significant dimension */
+        for (i=ndims-2; i>=0; i--)
+            *offset += start[i] * varp->dsizes[i+1];
+        *offset *= varp->xsz; /* offset in bytes */
+    }
+
+    return NC_NOERR;
+}
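To make the offset arithmetic concrete, a hedged worked example for a fixed-size 3D variable of shape 5x4x3 (so dsizes[] = {60, 12, 3}) and a 4-byte external type; the numbers are illustrative only:

#include <mpi.h>

/* Illustrative only: mimic ncmpio_calc_off() for start[] = {2, 1, 0}. */
static MPI_Offset calc_off_example(void)
{
    const MPI_Offset start[3]  = {2, 1, 0};
    const MPI_Offset dsizes[3] = {60, 12, 3};
    MPI_Offset off = start[2];        /* least significant dimension: 0 */
    off += start[1] * dsizes[2];      /* 1 * 3  = 3  */
    off += start[0] * dsizes[1];      /* 2 * 12 = 24 */
    return off * 4;                   /* in bytes: 27 * 4 = 108 */
}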
+
+/*----< ncmpio_calc_start_end() >--------------------------------------------*/
+/* Returns the file offsets of the access range of this request: the starting
+ * file offset and the end offset (exclusive).
+ * Note a zero-length request should never call this subroutine.
+ */
+int
+ncmpio_calc_start_end(const NC         *ncp,
+                      const NC_var     *varp,
+                      const MPI_Offset *start,     /* [varp->ndims] */
+                      const MPI_Offset *count,     /* [varp->ndims] */
+                      const MPI_Offset *stride,    /* [varp->ndims] */
+                      MPI_Offset       *start_off, /* OUT: start offset */
+                      MPI_Offset       *end_off)   /* OUT: end offset */
+{
+    int i, ndims = varp->ndims; /* number of dimensions of this variable */
+
+    /*
+     * varp->dsizes[] is computed from right to left product of shape
+     * For example, a 3D array of size 5x4x3 in C order,
+     * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3
+     * For record variable:     dsizes[0]=12 dsizes[1]=12 dsizes[2]=3
+     */
+    if (IS_RECVAR(varp)) {
+        *start_off = 0;
+        *end_off   = 0;
+        if (stride == NULL) {
+            if (ndims > 1) {
+                /* least significant dimension */
+                *start_off = start[ndims-1];
+                *end_off   = start[ndims-1]+(count[ndims-1]-1);
+                /* the remaining dimensions */
+                for (i=ndims-2; i>0; i--) {
+                    *start_off += start[i]*varp->dsizes[i+1];
+                    *end_off   += (start[i]+(count[i]-1))*varp->dsizes[i+1];
+                }
+            }
+            *start_off *= varp->xsz; /* offset in bytes */
+            *end_off   *= varp->xsz;
+            /* handle the unlimited, most significant dimension */
+            *start_off += start[0] * ncp->recsize;
+            *end_off   += (start[0]+(count[0]-1)) * ncp->recsize;
+        }
+        else {
+            if (ndims > 1) {
+                /* least significant dimension */
+                *start_off = start[ndims-1];
+                *end_off   = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1];
+                /* the remaining dimensions */
+                for (i=ndims-2; i>0; i--) {
+                    *start_off += start[i]*varp->dsizes[i+1];
+                    *end_off   += (start[i]+(count[i]-1)*stride[i]) *
+                                  varp->dsizes[i+1];
+                }
+            }
+            *start_off *= varp->xsz; /* offset in bytes */
+            *end_off   *= varp->xsz;
+            /* handle the unlimited, most significant dimension */
+            *start_off += start[0] * ncp->recsize;
+            *end_off   += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize;
+        }
+    }
+    else {
+        if (stride == NULL) {
+            /* first handle the least significant dimension */
+            *start_off = start[ndims-1];
+            *end_off   = start[ndims-1] + (count[ndims-1]-1);
+            /* remaining dimensions till the most significant dimension */
+            for (i=ndims-2; i>=0; i--) {
+                *start_off += start[i] * varp->dsizes[i+1];
+                *end_off   += (start[i]+(count[i]-1)) * varp->dsizes[i+1];
+            }
+        }
+        else {
+            /* first handle the least significant dimension */
+            *start_off = start[ndims-1];
+            *end_off   = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1];
+            /* remaining dimensions till the most significant dimension */
+            for (i=ndims-2; i>=0; i--) {
+                *start_off += start[i] * varp->dsizes[i+1];
+                *end_off   += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1];
+            }
+        }
+        *start_off *= varp->xsz; /* offset in bytes */
+        *end_off   *= varp->xsz;
+    }
+    *start_off += varp->begin; /* beginning file offset of this variable */
+    *end_off   += varp->begin + varp->xsz;
+
+    return NC_NOERR;
+}
+
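A hedged worked example of the access range with a stride, for the same illustrative 5x4x3 fixed-size variable (xsz = 4; the begin offset of 1024 is made up):

#include <mpi.h>

/* Illustrative only: mimic ncmpio_calc_start_end() for start = {0,0,0},
 * count = {2,2,2}, stride = {2,2,2}, dsizes[] = {60, 12, 3}. */
static void calc_range_example(MPI_Offset *start_off, MPI_Offset *end_off)
{
    const MPI_Offset dsizes[3] = {60, 12, 3};
    MPI_Offset begin = 1024, xsz = 4;

    *start_off = 0;  /* start[] is all zeros */
    /* last accessed element per dim: start[i] + (count[i]-1)*stride[i] = 2 */
    *end_off = 2 + 2*dsizes[2] + 2*dsizes[1];    /* 2 + 6 + 24 = 32 */

    *start_off = *start_off * xsz + begin;       /* 1024 */
    *end_off   = *end_off * xsz + begin + xsz;   /* exclusive end: 1156 */
}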
diff --git a/src/drivers/ncmpio/ncmpio_vard.c b/src/drivers/ncmpio/ncmpio_vard.c
index 7f3fe1224..ac032d1ac 100644
--- a/src/drivers/ncmpio/ncmpio_vard.c
+++ b/src/drivers/ncmpio/ncmpio_vard.c
@@ -55,9 +55,8 @@ getput_vard(NC *ncp,
     void *xbuf=NULL;
     int mpireturn, status=NC_NOERR, err=NC_NOERR, xtype_is_contig=1;
     int el_size, buftype_is_contig=0, need_swap_back_buf=0;
-    int need_convert=0, need_swap=0, coll_indep, rw_flag;
-    MPI_File fh;
-    MPI_Offset nelems=0, fnelems=0, bnelems=0, offset=0;
+    int need_convert=0, need_swap=0;
+    MPI_Offset fnelems=0, bnelems=0, offset=0;
     MPI_Datatype etype=MPI_DATATYPE_NULL, xtype=MPI_BYTE;
     MPI_Offset filetype_size=0;
 #ifdef HAVE_MPI_TYPE_SIZE_C
@@ -71,6 +70,17 @@ getput_vard(NC *ncp,
     int type_size;
 #endif

+    if (ncp->fstype != PNCIO_FSTYPE_MPIIO) {
+        fprintf(stderr, "PnetCDF vard APIs are only supported when using MPI-IO.\n");
+        fprintf(stderr, "Please set environment variable PNETCDF_HINTS to \"nc_pncio=disable\"\n");
+        return NC_ENOTSUPPORT;
+    }
+
+    if (ncp->num_aggrs_per_node > 0) {
+        fprintf(stderr, "PnetCDF vard APIs are not supported when intra-node aggregation is enabled\n");
+        return NC_ENOTSUPPORT;
+    }
+
 #ifdef ENABLE_SUBFILING
     /* call a separate routine if variable is stored in subfiles */
     if (varp->num_subfiles > 1) {
@@ -170,7 +180,7 @@ getput_vard(NC *ncp,
         bnelems = bufcount;
     }
     else {
-        /* find the element type of filetype. ncmpii_dtype_decode() checks
+        /* find the element type of buftype. ncmpii_dtype_decode() checks
          * NC_EMULTITYPES */
         err = ncmpii_dtype_decode(buftype, &etype, &el_size, &bnelems,
                                   NULL, &buftype_is_contig);
@@ -214,8 +224,8 @@ getput_vard(NC *ncp,
             }
         }

-        if (!need_convert &&
-            (!need_swap || (can_swap_in_place && buftype_is_contig))) {
+        if (!need_convert && buftype_is_contig &&
+            (!need_swap || can_swap_in_place)) {
             /* reuse buftype, bufcount, buf in later MPI file write */
             xbuf = buf;
             if (need_swap) {
@@ -246,7 +256,7 @@ getput_vard(NC *ncp,
         }
     }
     else { /* read request */
-        if (!need_convert && (!need_swap || buftype_is_contig)) {
+        if (!need_convert && !need_swap && buftype_is_contig) {
             /* reuse buftype, bufcount, buf in later MPI file read */
             xbuf = buf;
         }
@@ -259,18 +269,7 @@ getput_vard(NC *ncp,
             xtype_is_contig = 1;
         }
     }
-
-    /* Set nelems and xtype which will be used in MPI read/write */
-    if (buf != xbuf) {
-        /* xbuf is a malloc-ed contiguous buffer */
-        nelems = bnelems;
-    }
-    else {
-        /* we can safely use bufcount and buftype in MPI File read/write.
-         * Note buftype may be noncontiguous. */
-        nelems = bufcount;
-        xtype = buftype;
-    }
+    assert(xtype_is_contig == 1);

     /* set fileview's displacement to the variable's starting file offset */
     offset = varp->begin;
@@ -296,7 +295,6 @@ getput_vard(NC *ncp,
          */
         offset = 0;
         bufcount = 0;
-        nelems = 0;
         filetype_size = 0;
         filetype = MPI_BYTE;
         buftype = MPI_BYTE;
@@ -305,31 +303,79 @@ getput_vard(NC *ncp,
     }
     status = err;

+    /* set the MPI-IO fileview, this is a collective call */
+#if 1
+    /* vard API is only supported when using MPI-IO, not PNCIO */
+    char *mpi_name;
+    MPI_File fh;
+
     /* when ncp->nprocs == 1, ncp->collective_fh == ncp->independent_fh */
-    fh = ncp->independent_fh;
-    coll_indep = NC_REQ_INDEP;
-    if (ncp->nprocs > 1 && fIsSet(reqMode, NC_REQ_COLL)) {
-        fh = ncp->collective_fh;
-        coll_indep = NC_REQ_COLL;
-    }
+    fh = (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP))
+         ? ncp->collective_fh : ncp->independent_fh;

-    /* set the MPI-IO fileview, this is a collective call */
-    err = ncmpio_file_set_view(ncp, fh, &offset, filetype);
+    TRACE_IO(MPI_File_set_view, (fh, offset, MPI_BYTE, filetype, "native",
+                                 MPI_INFO_NULL));
+    if (mpireturn != MPI_SUCCESS) {
+        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        if (status == NC_NOERR) status = err;
+    }
+#else
+    err = ncmpio_file_set_view(ncp, offset, filetype, 0, NULL, NULL);
+#endif
     if (err != NC_NOERR) {
         if (status == NC_NOERR) status = err;
-        nelems = 0; /* skip this request */
+        filetype_size = 0; /* skip this request */
     }
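Given the two new NC_ENOTSUPPORT paths above, callers can anticipate them. A hedged user-side sketch that falls back from the vard API to the vara family when PNCIO or intra-node aggregation is active; the arguments are placeholders for illustration:

#include <mpi.h>
#include <pnetcdf.h>

/* Illustrative sketch: try the flexible vard API first; on
 * NC_ENOTSUPPORT, fall back to a plain put_vara call. */
int put_with_fallback(int ncid, int varid, MPI_Datatype filetype,
                      const MPI_Offset *start, const MPI_Offset *count,
                      const float *buf, MPI_Offset nelems)
{
    int err = ncmpi_put_vard_all(ncid, varid, filetype, buf,
                                 nelems, MPI_FLOAT);
    if (err == NC_ENOTSUPPORT)  /* vard unsupported under PNCIO/INA */
        err = ncmpi_put_vara_float_all(ncid, varid, start, count, buf);
    return err;
}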
-    rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? NC_REQ_RD : NC_REQ_WR;
+#if 1
+    /* vard API is only supported when using MPI-IO, not PNCIO */
+    int coll_indep = NC_REQ_INDEP;
+    if (ncp->nprocs > 1 && !fIsSet(ncp->flags, NC_MODE_INDEP))
+        coll_indep = NC_REQ_COLL;
+
+    PNCIO_View buf_view;
+    buf_view.type      = MPI_BYTE;
+    buf_view.size      = filetype_size;
+    buf_view.count     = 1;
+    buf_view.is_contig = 1;
+
+    if (fIsSet(reqMode, NC_REQ_RD)) {
+        MPI_Offset rlen;
+
+        if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL)
+            rlen = ncmpio_file_read_at_all(ncp, 0, xbuf, buf_view);
+        else
+            rlen = ncmpio_file_read_at(ncp, 0, xbuf, buf_view);
+        if (status == NC_NOERR && rlen < 0) status = (int)rlen;
+    }
+    else {
+        MPI_Offset wlen;

-    err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, nelems,
-                            xtype, xbuf, xtype_is_contig);
+        if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL)
+            wlen = ncmpio_file_write_at_all(ncp, 0, xbuf, buf_view);
+        else
+            wlen = ncmpio_file_write_at(ncp, 0, xbuf, buf_view);
+        if (status == NC_NOERR && wlen < 0) status = (int)wlen;
+    }
+#else
+    int rw_flag = (fIsSet(reqMode, NC_REQ_RD)) ? NC_REQ_RD : NC_REQ_WR;
+
+    err = ncmpio_read_write(ncp, rw_flag, 0, nelems, xtype, xbuf);
     if (status == NC_NOERR) status = err;
+#endif

-    /* No longer need to reset the file view, as the root's fileview includes
-     * the whole file header.
-    MPI_File_set_view(fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL);
-     */
+    /* reset fileview to make entire file visible */
+#if 1
+    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native",
+                                 MPI_INFO_NULL));
+    if (mpireturn != MPI_SUCCESS) {
+        err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
+        if (status == NC_NOERR) status = err;
+    }
+#else
+    err = ncmpio_file_set_view(ncp, 0, MPI_BYTE, 0, NULL, NULL);
+    if (status == NC_NOERR) status = err;
+#endif

     if (fIsSet(reqMode, NC_REQ_RD)) {
         if (filetype_size == 0) return status;

diff --git a/src/drivers/ncmpio/ncmpio_wait.c b/src/drivers/ncmpio/ncmpio_wait.c
index fc635acfa..236499db1 100644
--- a/src/drivers/ncmpio/ncmpio_wait.c
+++ b/src/drivers/ncmpio/ncmpio_wait.c
@@ -34,71 +34,6 @@
 NetCDF XDR
 Level          xbuf     (XDR I/O buffer)
 */

-/* Prototypes for functions used only in this file */
-static int wait_getput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag,
-                       int coll_indep, MPI_Offset newnumrecs);
-
-static int mgetput(NC *ncp, int num_reqs, NC_req *reqs, int rw_flag,
-                   int coll_indep);
-
-/*----< ncmpio_getput_zero_req() >-------------------------------------------*/
-/* This function is called when this process has zero-length I/O request and
- * must participate all the MPI collective calls involved in the collective
- * APIs and wait_all(), which include setting fileview, collective read/write,
- * another setting fileview.
- *
- * This function is collective.
- */
-int
-ncmpio_getput_zero_req(NC *ncp, int reqMode)
-{
-    char *mpi_name;
-    int err, mpireturn, status=NC_NOERR;
-    MPI_Status mpistatus;
-    MPI_File fh;
-
-    /* do nothing if this came from an independent API */
-    if (fIsSet(reqMode, NC_REQ_INDEP)) return NC_NOERR;
-
-    fh = ncp->collective_fh;
-
-    TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", MPI_INFO_NULL));
-
-    if (fIsSet(reqMode, NC_REQ_RD)) {
-        if (ncp->nprocs > 1) {
-            TRACE_IO(MPI_File_read_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        }
-        else {
-            TRACE_IO(MPI_File_read_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus));
-        }
-        if (mpireturn != MPI_SUCCESS) {
-            err = ncmpii_error_mpi2nc(mpireturn, mpi_name);
-            err = (err == NC_EFILE) ? NC_EREAD : err;
NC_EREAD : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } else { /* write request */ - if (ncp->nprocs > 1) { - TRACE_IO(MPI_File_write_at_all, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - else { - TRACE_IO(MPI_File_write_at, (fh, 0, NULL, 0, MPI_BYTE, &mpistatus)); - } - if (mpireturn != MPI_SUCCESS) { - err = ncmpii_error_mpi2nc(mpireturn, mpi_name); - err = (err == NC_EFILE) ? NC_EWRITE : err; - DEBUG_ASSIGN_ERROR(status, err) - } - } - - /* No longer need to reset the file view, as the root's fileview includes - * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); - */ - - return status; -} - /*----< abuf_coalesce() >----------------------------------------------------*/ /* this function should be called after all bput requests have been served */ static int @@ -332,389 +267,115 @@ ncmpio_cancel(void *ncdp, return status; } -/*----< construct_filetypes() >----------------------------------------------*/ -/* concatenate the requests into a single MPI derived filetype */ +/*----< extract_reqs() >-----------------------------------------------------*/ +/* extract requests from the queues into new queues to be committed. + * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL + */ static int -construct_filetypes(NC *ncp, - NC_lead_req *lead_list, /* NC_REQ_WR or NC_REQ_RD */ - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *filetype) /* OUT */ +extract_reqs(NC *ncp, + int num_reqs, + int *req_ids, /* IN: [num_reqs] or NULL */ + int *statuses, /* IN: [num_reqs] or NULL */ + int *num_r_lead_reqs, /* OUT: no. lead get requests */ + int *num_r_reqs, /* OUT: no. non-lead get requests */ + NC_req **get_list, /* OUT: extracted get requests */ + int *num_w_lead_reqs, /* OUT: no. lead put requests */ + int *num_w_reqs, /* OUT: no. 
non-lead put requests */ + NC_req **put_list) /* OUT: extracted put requests */ { - int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req; - int mpireturn; - MPI_Datatype *ftypes; - - if (num_reqs <= 0) { /* for participating collective call */ - *filetype = MPI_BYTE; - return NC_NOERR;; - } + int i, j, status=NC_NOERR; + NC_req *put_list_ptr, *get_list_ptr; - /* hereinafter, num_reqs > 0 */ - ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs); + *num_r_lead_reqs = 0; + *num_w_lead_reqs = 0; + *num_r_reqs = 0; + *num_w_reqs = 0; - /* create a filetype for each request */ - last_contig_req = -1; /* index of the last contiguous request */ - j = 0; /* index of last valid ftypes */ - for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - lead = lead_list + reqs[i].lead_off; - ndims = lead->varp->ndims; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + } + if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { + /* the entire get requests */ + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - ftypes[j] = MPI_BYTE; /* in case the call below failed */ + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + } + if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || + num_reqs == NC_PUT_REQ_ALL) + return NC_NOERR; - if (ndims == 0) { /* scalar variable */ -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (lead->varp->begin > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) - status = err; /* report first error */ + if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { + /* this is the same as NC_PUT_REQ_ALL */ + for (i=0; inumLeadPutReqs; i++) { + ncp->put_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } -#endif - disps[j] = lead->varp->begin; - is_ftype_contig = 1; } - else { /* non-scalar variable */ - MPI_Offset offset, *count, *stride; - count = reqs[i].start + ndims; - stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ? 
- NULL : count + ndims; - - err = ncmpio_filetype_create_vars(ncp, - lead->varp, - reqs[i].start, - count, - stride, - &offset, - &ftypes[j], - &is_ftype_contig); - -#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET - if (err == NC_NOERR && offset > NC_MAX_INT) - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) -#endif - disps[j] = (MPI_Aint)offset; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - if (err != NC_NOERR) { - fSet(lead->flag, NC_REQ_SKIP); /* skip this request */ - if ( lead->status != NULL && - *lead->status == NC_NOERR) - *lead->status = err; - if (status == NC_NOERR) status = err; /* report first error */ - continue; + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; + return NC_NOERR; + } + if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { + /* this is the same as NC_GET_REQ_ALL */ + for (i=0; inumLeadGetReqs; i++) { + ncp->get_lead_list[i].status = statuses + i; + statuses[i] = NC_NOERR; } } + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - if (is_ftype_contig) { - MPI_Offset coalesced_len; + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; + } + if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && + statuses == NULL) { + /* this is the same as NC_REQ_ALL */ + for (i=0; ivarp->xsz * reqs[i].nelems; + for (i=0; inumLeadGetReqs; i++) + fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); + *num_w_lead_reqs = ncp->numLeadPutReqs; + *num_w_reqs = ncp->numPutReqs; + *put_list = ncp->put_list; + ncp->numPutReqs = 0; + ncp->put_list = NULL; -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = coalesced_len; -#else - if (coalesced_len > NC_MAX_INT) { - DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW) - if (status == NC_NOERR) - status = err; /* report first error */ - coalesced_len = 0; - } - blocklens[j] = (int)coalesced_len; -#endif - if (last_contig_req >= 0) - coalesced_len += blocklens[last_contig_req]; -#ifdef HAVE_MPI_LARGE_COUNT - if (last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = coalesced_len; - j--; - } - else last_contig_req = j; -#else - /* if coalesced_len overflows 4-byte int, then skip coalescing */ - if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 && - disps[j] - disps[last_contig_req] == - blocklens[last_contig_req]) { - blocklens[last_contig_req] = (int)coalesced_len; - j--; - } - else last_contig_req = j; -#endif - } - else { - /* we will construct a filetype, set blocklen to 1 */ - blocklens[j] = 1; - last_contig_req = -1; - all_ftype_contig = 0; - } - } - /* j is the new num_reqs */ - num_reqs = j; - - if (status != NC_NOERR) { - /* even if error occurs, we still must participate the collective - call to MPI_File_set_view() */ - *filetype = MPI_BYTE; - } - else if (num_reqs == 1 && disps[0] == 0) { - if (ftypes[0] == MPI_BYTE) - *filetype = MPI_BYTE; - else { - mpireturn = MPI_Type_dup(ftypes[0], filetype); - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup"); - } - } - else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */ - /* all ftypes[] created fine, now concatenate all ftypes[] */ - if (all_ftype_contig) { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#else - mpireturn = 
MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - else { -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps, - ftypes, filetype); -#else - mpireturn = MPI_Type_create_struct(num_reqs, blocklens, disps, - ftypes, filetype); -#endif - if (mpireturn != MPI_SUCCESS) - err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct"); - else { - MPI_Type_commit(filetype); - err = NC_NOERR; - } - } - - if (err != NC_NOERR) *filetype = MPI_BYTE; - if (status == NC_NOERR) status = err; /* report the first error */ - } - - for (i=0; i--------------------------------------------*/ -/* the input requests, reqs[], are non-interleaving requests */ -static int -construct_buffertypes(NC_lead_req *lead_list, - int num_reqs, -#ifdef HAVE_MPI_LARGE_COUNT - MPI_Count *blocklens, /* [num_reqs] temp buffer */ - MPI_Count *disps, /* [num_reqs] temp buffer */ -#else - int *blocklens, /* [num_reqs] temp buffer */ - MPI_Aint *disps, /* [num_reqs] temp buffer */ -#endif - NC_req *reqs, /* [num_reqs] */ - MPI_Datatype *buf_type) /* OUT */ -{ - int i, j, k, status=NC_NOERR, mpireturn; - MPI_Aint a0, ai; - - *buf_type = MPI_BYTE; - if (num_reqs == 0) return NC_NOERR; - - /* create the I/O buffer derived data type */ - - /* calculate blocklens[], and disps[] */ - for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - - req_size = lead->varp->xsz; - if (lead->varp->ndims > 0) { /* non-scalar variable */ - MPI_Offset *count = reqs[i].start + lead->varp->ndims; - if (!IS_RECVAR(lead->varp)) req_size *= count[0]; - for (k=1; kvarp->ndims; k++) req_size *= count[k]; - } - -#ifdef HAVE_MPI_LARGE_COUNT - blocklens[j] = req_size; -#else - /* check int overflow */ - if (req_size > NC_MAX_INT) { /* skip this request */ - fSet(lead->flag, NC_REQ_SKIP); - DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) - continue; - } - blocklens[j] = (int)req_size; -#endif - - MPI_Get_address(reqs[i].xbuf, &ai); - if (j == 0) a0 = ai; - disps[j] = MPI_Aint_diff(ai, a0); - j++; - } - /* update num_reqs to number of valid requests */ - num_reqs = j; - - if (num_reqs > 0) { - /* concatenate buffer addresses into a single buffer type */ -#ifdef HAVE_MPI_LARGE_COUNT - mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, - MPI_BYTE, buf_type); -#else - mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, - MPI_BYTE, buf_type); -#endif - if (mpireturn != MPI_SUCCESS) { - int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); - /* return the first encountered error if there is any */ - if (status == NC_NOERR) status = err; - } - else - MPI_Type_commit(buf_type); - } - - return status; -} - -/*----< extract_reqs() >-----------------------------------------------------*/ -/* extract requests from the queues into new queues to be committed. - * Input value of num_reqs can be NC_REQ_ALL, NC_GET_REQ_ALL, or NC_PUT_REQ_ALL - */ -static int -extract_reqs(NC *ncp, - int num_reqs, - int *req_ids, /* IN: [num_reqs] or NULL */ - int *statuses, /* IN: [num_reqs] or NULL */ - int *num_r_lead_reqs, /* OUT: no. lead get requests */ - int *num_r_reqs, /* OUT: no. non-lead get requests */ - NC_req **get_list, /* OUT: extracted get requests */ - int *num_w_lead_reqs, /* OUT: no. lead put requests */ - int *num_w_reqs, /* OUT: no. 
non-lead put requests */ - NC_req **put_list) /* OUT: extracted put requests */ -{ - int i, j, status=NC_NOERR; - NC_req *put_list_ptr, *get_list_ptr; - - *num_r_lead_reqs = 0; - *num_w_lead_reqs = 0; - *num_r_reqs = 0; - *num_w_reqs = 0; - - if (num_reqs == NC_PUT_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire put requests */ - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - } - if (num_reqs == NC_GET_REQ_ALL || num_reqs == NC_REQ_ALL) { - /* the entire get requests */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL || - num_reqs == NC_PUT_REQ_ALL) - return NC_NOERR; - - if (ncp->numGetReqs == 0 && num_reqs == ncp->numLeadPutReqs) { - /* this is the same as NC_PUT_REQ_ALL */ - for (i=0; inumLeadPutReqs; i++) { - ncp->put_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - return NC_NOERR; - } - if (ncp->numPutReqs == 0 && num_reqs == ncp->numLeadGetReqs) { - /* this is the same as NC_GET_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) { - ncp->get_lead_list[i].status = statuses + i; - statuses[i] = NC_NOERR; - } - } - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; - } - if (num_reqs == ncp->numLeadPutReqs + ncp->numLeadGetReqs && - statuses == NULL) { - /* this is the same as NC_REQ_ALL */ - for (i=0; inumLeadGetReqs; i++) - fSet(ncp->get_lead_list[i].flag, NC_REQ_TO_FREE); - *num_w_lead_reqs = ncp->numLeadPutReqs; - *num_w_reqs = ncp->numPutReqs; - *put_list = ncp->put_list; - ncp->numPutReqs = 0; - ncp->put_list = NULL; - - for (i=0; inumLeadPutReqs; i++) - fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); - *num_r_lead_reqs = ncp->numLeadGetReqs; - *num_r_reqs = ncp->numGetReqs; - *get_list = ncp->get_list; - ncp->numGetReqs = 0; - ncp->get_list = NULL; - return NC_NOERR; + for (i=0; inumLeadPutReqs; i++) + fSet(ncp->put_lead_list[i].flag, NC_REQ_TO_FREE); + *num_r_lead_reqs = ncp->numLeadGetReqs; + *num_r_reqs = ncp->numGetReqs; + *get_list = ncp->get_list; + ncp->numGetReqs = 0; + ncp->get_list = NULL; + return NC_NOERR; } /* requests are a subset of pending requests */ @@ -797,7 +458,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->put_lead_list[j].id) { memcpy(put_list_ptr, ncp->put_list + ncp->put_lead_list[j].nonlead_off, - ncp->put_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->put_lead_list[j].nonlead_num); put_list_ptr += ncp->put_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -810,7 +471,7 @@ extract_reqs(NC *ncp, req_ids[i] == ncp->get_lead_list[j].id) { memcpy(get_list_ptr, ncp->get_list + ncp->get_lead_list[j].nonlead_off, - ncp->get_lead_list[j].nonlead_num * sizeof(NC_req)); + sizeof(NC_req) * ncp->get_lead_list[j].nonlead_num); 
get_list_ptr += ncp->get_lead_list[j].nonlead_num; req_ids[i] = NC_REQ_NULL; break; @@ -987,30 +648,72 @@ req_commit(NC *ncp, do_write = (num_w_reqs > 0); } +#if 1 /* carry out writes and reads separately (writes first) */ - if (do_write > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_WR, num_w_reqs, put_list, + newnumrecs); + put_list = NULL; /* has been freed in the above call */ + + /* Update the number of records if new records have been created. + * For nonblocking APIs, there is no way for a process to know whether + * others write to a record variable or not. Note newnumrecs has been + * sync-ed and always >= ncp->numrecs. + */ + if (coll_indep == NC_REQ_COLL) { + if (newnumrecs > ncp->numrecs) { + /* update new record number in file. Note newnumrecs is already + * sync-ed among all processes and in collective mode + * ncp->numrecs is always sync-ed in memory among processes, + * thus no need another MPI_Allreduce to sync it. */ + err = ncmpio_write_numrecs(ncp, newnumrecs); + if (status == NC_NOERR) status = err; + /* retain the first error if there is any */ + if (ncp->numrecs < newnumrecs) ncp->numrecs = newnumrecs; + } + } + else { /* NC_REQ_INDEP */ + if (ncp->numrecs < newnumrecs) { + ncp->numrecs = newnumrecs; + set_NC_ndirty(ncp); + /* delay numrecs sync until end_indep, redef or close */ + } + } + } + if (do_read > 0) { + err = ncmpio_ina_nreqs(ncp, NC_REQ_RD, num_r_reqs, get_list, + newnumrecs); + get_list = NULL; /* has been freed in the above call */ + } +#else + if (do_write > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, + num_w_reqs, put_list, + newnumrecs); + else + err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, + newnumrecs); + put_list = NULL; /* has been freed in wait_getput() */ + } + + if (do_read > 0) { + if (ncp->num_aggrs_per_node > 0 && coll_indep == NC_REQ_COLL) + /* intra-node aggregation must be in collective mode */ + err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_RD, + num_r_reqs, get_list, + newnumrecs); + else + err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, + newnumrecs); + get_list = NULL; /* has been freed in wait_getput() */ + } +#endif + + /* retain the first error status */ + if (status == NC_NOERR) status = err; - if (ncp->my_aggr >= 0 && coll_indep == NC_REQ_COLL && ncp->nprocs > 1) - /* intra-node write aggregation must be in collective mode */ - err = ncmpio_intra_node_aggregation_nreqs(ncp, NC_REQ_WR, - num_w_reqs, put_list, - newnumrecs); - else - err = wait_getput(ncp, num_w_reqs, put_list, NC_REQ_WR, coll_indep, - newnumrecs); - put_list = NULL; /* has been freed in wait_getput() */ - } - - if (do_read > 0) { - err = wait_getput(ncp, num_r_reqs, get_list, NC_REQ_RD, coll_indep, - newnumrecs); - get_list = NULL; /* has been freed in wait_getput() */ - } - - /* retain the first error status */ - if (status == NC_NOERR) status = err; - /* post-IO data processing: In write case, we may need to byte-swap user * write buf if it is used as the write buffer in MPI write call and the * target machine is little Endian. 
For read case, we may need to
@@ -1114,137 +817,424 @@ req_commit(NC *ncp,
             j++;
         }
     }
-    ncp->numLeadGetReqs = j;
-    if (ncp->numLeadGetReqs == 0) {
-        NCI_Free(ncp->get_list);
-        NCI_Free(ncp->get_lead_list);
-        ncp->get_list = NULL;
-        ncp->get_lead_list = NULL;
-    }
+        ncp->numLeadGetReqs = j;
+        if (ncp->numLeadGetReqs == 0) {
+            NCI_Free(ncp->get_list);
+            NCI_Free(ncp->get_lead_list);
+            ncp->get_list = NULL;
+            ncp->get_lead_list = NULL;
+        }
+    }
+
+    return status;
+}
+
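The new ncmpio_wait() that follows backs both ncmpi_wait() and
ncmpi_wait_all(). As its comments note, the collective flavor must be entered
by every process that opened the file, even ranks with zero pending requests,
because the flush path issues collective MPI operations. A minimal usage
sketch against the public API (the helper name is hypothetical; error
handling is trimmed):

#include <mpi.h>
#include <pnetcdf.h>

/* Hypothetical helper: flush nonblocking requests previously posted with
 * ncmpi_iput_*/ncmpi_iget_* on every process, then flush any leftovers. */
static int flush_all_requests(int ncid, int nreqs, int *reqs)
{
    int err;

    /* collective: all processes must call, even when nreqs == 0 */
    err = ncmpi_wait_all(ncid, nreqs, reqs, NULL); /* statuses may be NULL */
    if (err != NC_NOERR) return err;

    /* NC_REQ_ALL flushes every pending request; req_ids/statuses ignored */
    return ncmpi_wait_all(ncid, NC_REQ_ALL, NULL, NULL);
}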
+/*----< ncmpio_wait() >------------------------------------------------------*/
+int
+ncmpio_wait(void *ncdp,
+            int   num_reqs,
+            int  *req_ids,  /* [num_reqs]: IN/OUT */
+            int  *statuses, /* [num_reqs] */
+            int   reqMode)  /* only check if NC_REQ_COLL or NC_REQ_INDEP */
+{
+    NC *ncp = (NC*)ncdp;
+    int coll_indep;
+
+    if (NC_indef(ncp)) /* wait must be called in data mode */
+        DEBUG_RETURN_ERROR(NC_EINDEFINE)
+
+    coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? NC_REQ_INDEP : NC_REQ_COLL;
+
+#ifdef ENABLE_REQ_AGGREGATION
+    /* check collective or independent mode */
+    if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp))
+        DEBUG_RETURN_ERROR(NC_ENOTINDEP)
+    else if (coll_indep == NC_REQ_COLL && NC_indep(ncp))
+        DEBUG_RETURN_ERROR(NC_EINDEP)
+
+    if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR;
+
+    return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep);
+#else
+    /* If request aggregation is disabled, we call an independent wait() for
+     * each request
+     */
+    int i, status=NC_NOERR, err;
+
+    if (coll_indep == NC_REQ_INDEP) {
+        /* This is called from ncmpi_wait(), which is an independent call.
+         * Argument num_reqs can be NC_REQ_ALL which means to flush all pending
+         * nonblocking requests. In this case, arguments req_ids and statuses
+         * will be ignored.
+         * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL,
+         * NC_PUT_REQ_ALL, or a non-negative value.
+         * Argument statuses can be NULL, meaning the caller only cares about
+         * the error code returned by this call, but not the statuses of
+         * individual nonblocking requests.
+         */
+        if (num_reqs == 0) return NC_NOERR;
+
+        /* This is called from ncmpi_wait which must be called in independent
+         * data mode, illegal in collective mode.
+         */
+        if (!NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_ENOTINDEP);
+
+        if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR;
+    }
+    else {
+        /* This is called from ncmpi_wait_all(), which is a collective call.
+         * Argument num_reqs can be NC_REQ_ALL which means to flush all pending
+         * nonblocking requests. In this case, arguments req_ids and statuses
+         * will be ignored.
+         * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL,
+         * NC_PUT_REQ_ALL, or a non-negative value.
+         * Argument statuses can be NULL, meaning the caller only cares about
+         * the error code returned by this call, but not the statuses of
+         * individual nonblocking requests.
+         */
+        /* the following line CANNOT be added, because ncmpi_wait_all() is a
+         * collective call, all processes must participate in some MPI
+         * collective operations used later on.
+         */
+        /* if (num_reqs == 0) return NC_NOERR; */
+
+        /* This is called from ncmpi_wait_all which must be called in
+         * collective data mode, illegal in independent mode. This also
+         * ensures the program will return to collective mode.
+         */
+        if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP);
+
+        /* must enter independent mode, as num_reqs may be different among
+           processes */
+        err = ncmpio_begin_indep_data(ncp);
+        if (status == NC_NOERR) status = err;
+    }
+
+    if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */
+        if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) {
+            while (ncp->numLeadGetReqs) {
+                /* commit one request at a time. Note ncp->numLeadGetReqs
+                 * will be decreased in req_commit()
+                 */
+                err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL,
+                                 NC_REQ_INDEP);
+                if (status == NC_NOERR) status = err;
+            }
+        }
+        if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) {
+            while (ncp->numLeadPutReqs) {
+                /* commit one request at a time. Note ncp->numLeadPutReqs
+                 * will be decreased in req_commit()
+                 */
+                err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL,
+                                 NC_REQ_INDEP);
+                if (status == NC_NOERR) status = err;
+            }
+        }
+    }
+    else {
+        for (i=0; i<num_reqs; i++) {
+            err = req_commit(ncp, 1, &req_ids[i],
+                             (statuses == NULL) ? NULL : &statuses[i],
+                             NC_REQ_INDEP);
+            if (status == NC_NOERR) status = err;
+        }
+    }
+
+    if (coll_indep == NC_REQ_COLL) {
+        /* return to collective data mode */
+        err = ncmpio_end_indep_data(ncp);
+        if (status == NC_NOERR) status = err;
+    }
+
+    return status; /* return the first error encountered, if there is any */
+#endif
+}
+
+#ifdef ENABLE_REQ_AGGREGATION
+/*----< construct_filetypes() >----------------------------------------------*/
+/* concatenate the requests into a single MPI derived filetype */
+static int
+construct_filetypes(NC *ncp,
+                    NC_lead_req   *lead_list, /* NC_REQ_WR or NC_REQ_RD */
+                    int            num_reqs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                    MPI_Count     *blocklens, /* [num_reqs] temp buffer */
+                    MPI_Count     *disps,     /* [num_reqs] temp buffer */
+#else
+                    int           *blocklens, /* [num_reqs] temp buffer */
+                    MPI_Aint      *disps,     /* [num_reqs] temp buffer */
+#endif
+                    NC_req        *reqs,      /* [num_reqs] */
+                    MPI_Datatype  *filetype)  /* OUT */
+{
+    int i, j, err, status=NC_NOERR, all_ftype_contig=1, last_contig_req;
+    int mpireturn;
+    MPI_Datatype *ftypes;
+
+    if (num_reqs <= 0) { /* for participating collective call */
+        *filetype = MPI_BYTE;
+        return NC_NOERR;
+    }
+
+    /* hereinafter, num_reqs > 0 */
+    ftypes = (MPI_Datatype*) NCI_Malloc(sizeof(MPI_Datatype) * num_reqs);
+
+    /* create a filetype for each request */
+    last_contig_req = -1; /* index of the last contiguous request */
+    j = 0;                /* index of last valid ftypes */
+    for (i=0; i<num_reqs; i++) {
+        int is_ftype_contig, ndims;
+        NC_lead_req *lead;
+
+        lead = lead_list + reqs[i].lead_off;
+        ndims = lead->varp->ndims;
+
+        ftypes[j] = MPI_BYTE; /* in case the call below failed */
+
+        if (ndims == 0) { /* scalar variable */
+#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET
+            if (lead->varp->begin > NC_MAX_INT) {
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+                fSet(lead->flag, NC_REQ_SKIP); /* skip this request */
+                if ( lead->status != NULL &&
+                    *lead->status == NC_NOERR)
+                    *lead->status = err;
+                if (status == NC_NOERR)
+                    status = err; /* report first error */
+            }
+#endif
+            disps[j] = lead->varp->begin;
+            is_ftype_contig = 1;
+        }
+        else if (reqs[i].npairs == 1) { /* only one offset-length pair */
+            /* reqs[i].offset_start has been set back in wait_getput() */
+            disps[j] = reqs[i].offset_start;
+            is_ftype_contig = 1;
+        }
+        else { /* non-scalar variable with more offset-length pairs */
+            MPI_Offset offset, *count, *stride;
+            count  = reqs[i].start + ndims;
+            stride = fIsSet(lead->flag, NC_REQ_STRIDE_NULL) ?
+                     NULL : count + ndims;
+
+            err = ncmpio_filetype_create_vars(ncp,
+                                              lead->varp,
+                                              reqs[i].start,
+                                              count,
+                                              stride,
+                                              &offset,
+                                              &ftypes[j],
+                                              &is_ftype_contig);
+
+#if SIZEOF_MPI_AINT < SIZEOF_MPI_OFFSET
+            if (err == NC_NOERR && offset > NC_MAX_INT)
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+#endif
+            disps[j] = (MPI_Aint)offset;
+
+            if (err != NC_NOERR) {
+                fSet(lead->flag, NC_REQ_SKIP); /* skip this request */
+                if ( lead->status != NULL &&
+                    *lead->status == NC_NOERR)
+                    *lead->status = err;
+                if (status == NC_NOERR) status = err; /* report first error */
+                continue;
+            }
+        }
+
+        if (is_ftype_contig) {
+            MPI_Offset coalesced_len;
+
+            /* No need to construct a filetype */
+            coalesced_len = lead->varp->xsz * reqs[i].nelems;
+
+#ifdef HAVE_MPI_LARGE_COUNT
+            blocklens[j] = coalesced_len;
+#else
+            if (coalesced_len > NC_MAX_INT) {
+                DEBUG_ASSIGN_ERROR(err, NC_EINTOVERFLOW)
+                if (status == NC_NOERR)
+                    status = err; /* report first error */
+                coalesced_len = 0;
+            }
+            blocklens[j] = (int)coalesced_len;
+#endif
+            if (last_contig_req >= 0)
+                coalesced_len += blocklens[last_contig_req];
+#ifdef HAVE_MPI_LARGE_COUNT
+            if (last_contig_req >= 0 &&
+                disps[j] - disps[last_contig_req] ==
+                blocklens[last_contig_req]) {
+                blocklens[last_contig_req] = coalesced_len;
+                j--;
+            }
+            else last_contig_req = j;
+#else
+            /* if coalesced_len overflows 4-byte int, then skip coalescing */
+            if (coalesced_len < NC_MAX_INT && last_contig_req >= 0 &&
+                disps[j] - disps[last_contig_req] ==
+                blocklens[last_contig_req]) {
+                blocklens[last_contig_req] = (int)coalesced_len;
+                j--;
+            }
+            else last_contig_req = j;
+#endif
+        }
+        else {
+            /* we will construct a filetype, set blocklen to 1 */
+            blocklens[j] = 1;
+            last_contig_req = -1;
+            all_ftype_contig = 0;
+        }
+    }
+    /* j is the new num_reqs */
+    num_reqs = j;
+
+    if (status != NC_NOERR) {
+        /* even if an error occurs, we must still participate in the
+           collective call to MPI_File_set_view() */
+        *filetype = MPI_BYTE;
+    }
+    else if (num_reqs == 1 && disps[0] == 0) {
+        if (ftypes[0] == MPI_BYTE)
+            *filetype = MPI_BYTE;
+        else {
+            mpireturn = MPI_Type_dup(ftypes[0], filetype);
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_dup");
+        }
+    }
+    else { /* if (num_reqs > 1 || (num_reqs == 1 && disps[0] > 0)) */
+        /* all ftypes[] created fine, now concatenate all ftypes[] */
+        if (all_ftype_contig) {
+#ifdef HAVE_MPI_LARGE_COUNT
+            mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps,
+                                                   MPI_BYTE, filetype);
+#else
+            mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps,
+                                                 MPI_BYTE, filetype);
+#endif
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_hindexed");
+            else {
+                MPI_Type_commit(filetype);
+                err = NC_NOERR;
+            }
+        }
+        else {
+#ifdef HAVE_MPI_LARGE_COUNT
+            mpireturn = MPI_Type_create_struct_c(num_reqs, blocklens, disps,
+                                                 ftypes, filetype);
+#else
+            mpireturn = MPI_Type_create_struct(num_reqs, blocklens, disps,
+                                               ftypes, filetype);
+#endif
+            if (mpireturn != MPI_SUCCESS)
+                err = ncmpii_error_mpi2nc(mpireturn, "MPI_Type_create_struct");
+            else {
+                MPI_Type_commit(filetype);
+                err = NC_NOERR;
+            }
+        }
+
+        if (err != NC_NOERR) *filetype = MPI_BYTE;
+        if (status == NC_NOERR) status = err; /* report the first error */
+    }
+
+    for (i=0; i<num_reqs; i++)
+        if (ftypes[i] != MPI_BYTE) MPI_Type_free(&ftypes[i]);
+    NCI_Free(ftypes);
+
+    return status;
+}
+
-/*----< ncmpio_wait() >------------------------------------------------------*/
-int
-ncmpio_wait(void *ncdp,
-            int   num_reqs,
-            int  *req_ids,  /* [num_reqs]: IN/OUT */
-            int  *statuses, /* [num_reqs] */
-            int   reqMode)  /* only check if NC_REQ_COLL or NC_REQ_INDEP */
+/*----<
construct_buffertypes() >--------------------------------------------*/ +/* the input requests, reqs[], are non-interleaving requests */ +static int +construct_buffertypes(NC_lead_req *lead_list, + int num_reqs, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *blocklens, /* [num_reqs] temp buffer */ + MPI_Count *disps, /* [num_reqs] temp buffer */ +#else + int *blocklens, /* [num_reqs] temp buffer */ + MPI_Aint *disps, /* [num_reqs] temp buffer */ +#endif + NC_req *reqs, /* [num_reqs] */ + MPI_Datatype *buf_type) /* OUT */ { - NC *ncp = (NC*)ncdp; - int coll_indep; - - if (NC_indef(ncp)) /* wait must be called in data mode */ - DEBUG_RETURN_ERROR(NC_EINDEFINE) - - coll_indep = (fIsSet(reqMode, NC_REQ_INDEP)) ? NC_REQ_INDEP : NC_REQ_COLL; + int i, j, k, status=NC_NOERR, mpireturn; + MPI_Aint a0, ai; -#ifdef ENABLE_REQ_AGGREGATION - /* check collective or independent mode */ - if (coll_indep == NC_REQ_INDEP && !NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_ENOTINDEP) - else if (coll_indep == NC_REQ_COLL && NC_indep(ncp)) - DEBUG_RETURN_ERROR(NC_EINDEP) + *buf_type = MPI_BYTE; + if (num_reqs == 0) return NC_NOERR; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; + /* create the I/O buffer derived data type */ - return req_commit(ncp, num_reqs, req_ids, statuses, coll_indep); -#else - /* If request aggregation is disabled, we call an independent wait() for - * each request - */ - int i, status=NC_NOERR, err; + /* calculate blocklens[], and disps[] */ + for (i=0, j=0; iflag, NC_REQ_SKIP)) continue; - if (coll_indep == NC_REQ_INDEP && num_reqs == 0) return NC_NOERR; - } - else { - /* This is called from ncmpi_wait_all(), which is a collective call - * Argument num_reqs can be NC_REQ_ALL which means to flush all pending - * nonblocking requests. In this case, arguments req_ids and statuses - * will be ignored. - * Argument num_reqs must either be NC_REQ_ALL, NC_GET_REQ_ALL, - * NC_PUT_REQ_ALL, or a non-negative value. - * Argument statuses can be NULL, meaning the caller only cares about - * the error code returned by this call, but not the statuses of - * individual nonblocking requests. - */ - /* the following line CANNOT be added, because ncmpi_wait_all() is a - * collective call, all processes must participate some MPI collective - * operations used later on. - */ - /* if (num_reqs == 0) return NC_NOERR; */ + req_size = lead->varp->xsz; + if (lead->varp->ndims > 0) { /* non-scalar variable */ + MPI_Offset *count = reqs[i].start + lead->varp->ndims; + if (!IS_RECVAR(lead->varp)) req_size *= count[0]; + for (k=1; kvarp->ndims; k++) req_size *= count[k]; + } - /* This is called from ncmpi_wait_all which must be called in - * collective data mode, illegal in independent mode. This also - * ensures the program will returns back to collective mode. 
- */ - if (NC_indep(ncp)) DEBUG_RETURN_ERROR(NC_EINDEP); +#ifdef HAVE_MPI_LARGE_COUNT + blocklens[j] = req_size; +#else + /* check int overflow */ + if (req_size > NC_MAX_INT) { /* skip this request */ + fSet(lead->flag, NC_REQ_SKIP); + DEBUG_ASSIGN_ERROR(status, NC_EINTOVERFLOW) + continue; + } + blocklens[j] = (int)req_size; +#endif - /* must enter independent mode, as num_reqs may be different among - processes */ - err = ncmpio_begin_indep_data(ncp); - if (status == NC_NOERR) status = err; + MPI_Get_address(reqs[i].xbuf, &ai); + if (j == 0) a0 = ai; + disps[j] = MPI_Aint_diff(ai, a0); + j++; } + /* update num_reqs to number of valid requests */ + num_reqs = j; - if (num_reqs <= NC_REQ_ALL) { /* flush all get or put pending requests */ - if (num_reqs == NC_REQ_ALL || num_reqs == NC_GET_REQ_ALL) { - while (ncp->numLeadGetReqs) { - /* commit one request at a time. Note ncp->numLeadGetReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->get_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - if (num_reqs == NC_REQ_ALL || num_reqs == NC_PUT_REQ_ALL) { - while (ncp->numLeadPutReqs) { - /* commit one request at a time. Note ncp->numLeadPutReqs - * will be descreased in req_commit() - */ - err = req_commit(ncp, 1, &ncp->put_lead_list[0].id, NULL, - NC_REQ_INDEP); - if (status == NC_NOERR) status = err; - } - } - } - else { - for (i=0; i 0) { + /* concatenate buffer addresses into a single buffer type */ +#ifdef HAVE_MPI_LARGE_COUNT + mpireturn = MPI_Type_create_hindexed_c(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#else + mpireturn = MPI_Type_create_hindexed(num_reqs, blocklens, disps, + MPI_BYTE, buf_type); +#endif + if (mpireturn != MPI_SUCCESS) { + int err = ncmpii_error_mpi2nc(mpireturn,"MPI_Type_create_hindexed"); + /* return the first encountered error if there is any */ if (status == NC_NOERR) status = err; } + else + MPI_Type_commit(buf_type); } - if (coll_indep == NC_REQ_COLL) { - /* return to collective data mode */ - err = ncmpio_end_indep_data(ncp); - if (status == NC_NOERR) status = err; - } - - return status; /* return the first error encountered, if there is any */ -#endif + return status; } /* C struct for breaking down a request to a list of offset-length segments */ @@ -1381,8 +1371,8 @@ merge_requests(NC *ncp, MPI_Offset *nsegs, /* OUT: no. 
off-len pairs */ off_len **segs) /* OUT: [*nsegs] */ { - int i, j, status=NC_NOERR, ndims; - MPI_Offset nseg, *start, *count, *shape, *stride; + int i, j, status=NC_NOERR, ndims, is_incr; + MPI_Offset nseg, *start, *count, *shape, *stride, prev_offset; MPI_Aint addr, buf_addr; *nsegs = 0; /* total number of offset-length pairs */ @@ -1397,43 +1387,18 @@ merge_requests(NC *ncp, /* Count the number off-len pairs from reqs[], so we can malloc a * contiguous memory space for storing off-len pairs */ - for (i=0; ivarp->ndims; - if (ndims > 0) { - start = reqs[i].start; - count = start + ndims; - stride = count + ndims; - } - else - start = count = stride = NULL; - - /* for record variable, each reqs[] is within a record */ - if (IS_RECVAR(lead->varp)) { - ndims--; - start++; - count++; - stride++; - } - if (fIsSet(lead->flag, NC_REQ_STRIDE_NULL)) stride = NULL; - - if (ndims < 0) continue; - if (ndims == 0) { /* 1D record variable */ - (*nsegs)++; - continue; - } - nseg = 1; - if (stride != NULL && stride[ndims-1] > 1) - nseg = count[ndims-1]; /* count of last dimension */ - for (j=0; joff = reqs[i].offset_start; + seg_ptr->len = reqs[i].nelems * lead->varp->xsz; + seg_ptr->buf_addr = addr; + if (prev_offset > seg_ptr->off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr->off; + seg_ptr++; + continue; + } + ndims = lead->varp->ndims; if (ndims > 0) { start = reqs[i].start; @@ -1476,15 +1454,18 @@ merge_requests(NC *ncp, addr, start, count, stride, &nseg, /* OUT: number of offset-length pairs */ seg_ptr); /* OUT: array of offset-length pairs */ + + /* check if (*segs)[].off are in an increasing order */ + for (j=0; j seg_ptr[j].off) + is_incr = 0; /* offsets are not incrementing */ + else + prev_offset = seg_ptr[j].off; + } seg_ptr += nseg; /* append the list to the end of segs array */ } - /* check if (*segs)[].off are in an increasing order */ - for (i=1; i<*nsegs; i++) { - if ((*segs)[i-1].off > (*segs)[i].off) - break; - } - if (i < *nsegs) /* not in an increasing order */ + if (!is_incr) /* not in an increasing order */ /* sort the off-len array, segs[], in an increasing order */ qsort(*segs, (size_t)(*nsegs), sizeof(off_len), off_compare); @@ -1751,8 +1732,7 @@ req_aggregation(NC *ncp, void *buf; /* point to starting buffer, used by MPI-IO call */ MPI_Aint b_begin, b_addr; MPI_Datatype filetype, buf_type, *ftypes, *btypes; - MPI_File fh; - MPI_Offset max_end, offset; + MPI_Offset max_end; if (num_reqs == 0) { /* only NC_REQ_COLL can reach here for 0 request */ assert(coll_indep == NC_REQ_COLL); @@ -2064,13 +2044,8 @@ req_aggregation(NC *ncp, } NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - offset = 0; - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2079,112 +2054,25 @@ req_aggregation(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_len, buf_type, - buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_len, buf_type, buf); + +assert(0); +/* This subroutine is no longer used. 
+ PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); +*/ + if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } -/*----< calculate_access_range() >-------------------------------------------*/ -/* Returns the file offsets of access range of this request: starting file - * offset and end offset (exclusive). - * Note zero-length request should never call this subroutine. - */ -static int -calculate_access_range(const NC *ncp, - const NC_var *varp, - const MPI_Offset *start, /* [varp->ndims] */ - const MPI_Offset *count, /* [varp->ndims] */ - const MPI_Offset *stride, /* [varp->ndims] */ - MPI_Offset *start_off, /* OUT: start offset */ - MPI_Offset *end_off) /* OUT: end offset */ -{ - int i, ndims = varp->ndims; /* number of dimensions of this variable */ - - /* - * varp->dsizes[] is computed from right to left product of shape - * For example, a 3D array of size 5x4x3 in C order, - * For fixed-size variable: dsizes[0]=60 dsizes[1]=12 dsizes[2]=3 - * For record variable: dsizes[0]=12 dsizes[1]=12 dsizes[2]=3 - */ - if (IS_RECVAR(varp)) { - *start_off = 0; - *end_off = 0; - if (stride == NULL) { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1); - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1))*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)) * ncp->recsize; - } - else { - if (ndims > 1) { - /* least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* the remaining dimensions */ - for (i=ndims-2; i>0; i--) { - *start_off += start[i]*varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i]) * - varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - /* handle the unlimited, most significant dimension */ - *start_off += start[0] * ncp->recsize; - *end_off += (start[0]+(count[0]-1)*stride[0]) * ncp->recsize; - } - } - else { - if (stride == NULL) { - /* first handle the least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1] + (count[ndims-1]-1); - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)) * varp->dsizes[i+1]; - } - } - else { - /* first handle the least significant dimension */ - *start_off = start[ndims-1]; - *end_off = start[ndims-1]+(count[ndims-1]-1)*stride[ndims-1]; - /* remaining dimensions till the most significant dimension */ - for (i=ndims-2; i>=0; i--) { - *start_off += start[i] * varp->dsizes[i+1]; - *end_off += (start[i]+(count[i]-1)*stride[i])*varp->dsizes[i+1]; - } - } - *start_off *= varp->xsz; /* offset in bytes */ - *end_off *= varp->xsz; - } - *start_off += varp->begin; /* beginning file offset of this variable */ - *end_off += varp->begin + varp->xsz; - - return NC_NOERR; -} - /*----< wait_getput() >------------------------------------------------------*/ 
static int wait_getput(NC *ncp, @@ -2210,8 +2098,17 @@ wait_getput(NC *ncp, varp = lead->varp; if (varp->ndims == 0) { /* scalar variable */ - reqs[i].offset_start = varp->begin; - reqs[i].offset_end = varp->begin + varp->xsz; + reqs[i].offset_start += varp->begin; + reqs[i].offset_end += varp->begin; + } + else if (reqs[i].npairs == 1) { /* only one offset-length pair */ + /* reqs[i].offset_end == reqs[i].nelems * varp->xsz */ + MPI_Offset off = varp->begin; + + if (IS_RECVAR(varp)) off += reqs[i].start[0] * ncp->recsize; + + reqs[i].offset_start += off; + reqs[i].offset_end += off; } else { /* start/count/stride have been allocated in a contiguous array */ @@ -2221,8 +2118,8 @@ wait_getput(NC *ncp, count + varp->ndims; /* calculate access range of this request */ - calculate_access_range(ncp, varp, reqs[i].start, count, stride, - &reqs[i].offset_start, &reqs[i].offset_end); + ncmpio_calc_start_end(ncp, varp, reqs[i].start, count, stride, + &reqs[i].offset_start, &reqs[i].offset_end); } if (i > 0) { /* check if offset_start are in a monotonic nondecreasing order */ @@ -2304,8 +2201,7 @@ mgetput(NC *ncp, void *buf=NULL; NC_lead_req *lead_list; MPI_Datatype filetype, buf_type=MPI_BYTE; - MPI_Offset offset=0, buf_count=0; - MPI_File fh; + MPI_Offset buf_count=0; #ifdef HAVE_MPI_LARGE_COUNT MPI_Count *blocklens; @@ -2489,12 +2385,8 @@ mgetput(NC *ncp, mpi_io: NCI_Free(reqs); - fh = ncp->independent_fh; - if (ncp->nprocs > 1 && coll_indep == NC_REQ_COLL) - fh = ncp->collective_fh; - /* set the MPI-IO fileview, this is a collective call */ - err = ncmpio_file_set_view(ncp, fh, &offset, filetype); + err = ncmpio_file_set_view(ncp, 0, filetype, 0, NULL, NULL); if (filetype != MPI_BYTE) MPI_Type_free(&filetype); if (err != NC_NOERR) { if (status == NC_NOERR) status = err; @@ -2503,17 +2395,19 @@ mgetput(NC *ncp, } /* call MPI_File_read_at_all/MPI_File_write_at_all */ - err = ncmpio_read_write(ncp, rw_flag, coll_indep, offset, buf_count, - buf_type, buf, ((buf_type == MPI_BYTE) ? 1 : 0)); + // err = ncmpio_read_write(ncp, rw_flag, 0, buf_count, buf_type, buf); + assert(0); + PNCIO_View buf_view; + err = ncmpio_read_write(ncp, rw_flag, 0, buf_view, buf); if (status == NC_NOERR) status = err; if (buf_type != MPI_BYTE) MPI_Type_free(&buf_type); /* No longer need to reset the file view, as the root's fileview includes * the whole file header. - TRACE_IO(MPI_File_set_view, (fh, 0, MPI_BYTE, MPI_BYTE, "native", - MPI_INFO_NULL)); */ return status; } +#endif + diff --git a/src/drivers/pncio/Makefile.am b/src/drivers/pncio/Makefile.am new file mode 100644 index 000000000..f5527c8d5 --- /dev/null +++ b/src/drivers/pncio/Makefile.am @@ -0,0 +1,51 @@ +# +# Copyright (C) 2025, Northwestern University and Argonne National Laboratory +# See COPYRIGHT notice in top-level directory. 
+#
+# @configure_input@
+
+SUFFIXES = .a .o .c .m4 .h
+
+AM_CPPFLAGS  = -I${top_srcdir}/src/include
+AM_CPPFLAGS += -I${top_builddir}/src/include
+AM_CPPFLAGS += -I${top_srcdir}/src/drivers/include
+AM_CPPFLAGS += -I${top_builddir}/src/drivers/include
+
+if PNETCDF_DEBUG
+   AM_CPPFLAGS += -DPNETCDF_DEBUG
+endif
+
+noinst_LTLIBRARIES = libpncio.la
+
+H_SRCS = pncio.h
+
+C_SRCS = pncio_read.c \
+         pncio_write.c \
+         pncio_open.c \
+         pncio_close.c \
+         pncio_fstype.c \
+         pncio_aggregate.c \
+         pncio_read_str.c \
+         pncio_read_coll.c \
+         pncio_read_str_naive.c \
+         pncio_write_coll.c \
+         pncio_write_str.c \
+         pncio_write_str_naive.c \
+         pncio_utils.c \
+         pncio_lustre_open.c \
+         pncio_lustre_wrcoll.c \
+         pncio_lustre_wrstr.c \
+         pncio_lock.c \
+         pncio_set_size.c \
+         pncio_sync.c \
+         pncio_delete.c \
+         pncio_set_view.c \
+         pncio_hints.c
+
+
+libpncio_la_SOURCES = $(C_SRCS) $(H_SRCS)
+
+CLEANFILES = core core.* *.gcda *.gcno *.gcov gmon.out
+
+tests-local: all
+
diff --git a/src/drivers/pncio/pncio.h b/src/drivers/pncio/pncio.h
new file mode 100644
index 000000000..035febd79
--- /dev/null
+++ b/src/drivers/pncio/pncio.h
@@ -0,0 +1,354 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifndef H_PNCIO
+#define H_PNCIO
+
+#include
+#include
+#include
+#include <unistd.h>    /* pwrite() */
+
+#include
+#include <string.h>    /* memcpy() */
+#include <stddef.h>    /* size_t */
+#include <sys/types.h> /* off_t */
+#include
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif
+#define FDTYPE int
+
+#include <mpi.h>
+#include
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+#define NMEASURES 8
+#endif
+
+#define PNCIO_LOCKS 300 /* file system supports fcntl()-style locking */
+#define PNCIO_Feature(a, b) ((b == PNCIO_LOCKS) ? 1 : 0)
+
+#if defined(F_SETLKW64)
+#define PNCIO_UNLOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock64(fd, F_SETLK, F_UNLCK, offset, whence, len)
+#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock64(fd, F_SETLKW, F_WRLCK, offset, whence, len)
+#else
+#define PNCIO_UNLOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock(fd, F_SETLK, F_UNLCK, offset, whence, len)
+#define PNCIO_WRITE_LOCK(fd, offset, whence, len) \
+        PNCIO_GEN_SetLock(fd, F_SETLKW, F_WRLCK, offset, whence, len)
+#endif
+
+
+#define PNCIO_PERM 0666 /* file creation permission mask */
+
+#define PNCIO_UFS          152 /* Unix file system */
+#define PNCIO_LUSTRE       163 /* Lustre */
+#define PNCIO_FSTYPE_MPIIO  -1 /* Use MPI-IO */
+#define PNCIO_FSTYPE_CHECK   0 /* Use PnetCDF PNCIO drivers */
+
+#define PNCIO_LUSTRE_MAX_OSTS 256 /* Maximum number of Lustre OSTs if hint
+                                   * striping_factor is not set by user.
+                                   */
+
+#define PNCIO_CB_BUFFER_SIZE_DFLT     "16777216"
+#define PNCIO_IND_RD_BUFFER_SIZE_DFLT "4194304"
+#define PNCIO_IND_WR_BUFFER_SIZE_DFLT "524288"
+#define PNCIO_CB_CONFIG_LIST_DFLT     "*:1"
+
+/* PNCIO_DS_WR_NPAIRS_LB is the lower bound of the total number of
+ * offset-length pairs over the non-aggregator senders to be received by an
+ * I/O aggregator to skip the potentially expensive heap-merge sort that
+ * determines whether or not data sieving write is necessary.
+ * PNCIO_DS_WR_NAGGRS_LB is the lower bound of the number of non-aggregators
+ * sending their offset-length pairs to an I/O aggregator.
+ * Both conditions must be met to skip the heap-merge sort.
+ *
+ * When data sieving is enabled, read-modify-write is performed at each round
+ * of two-phase I/O at each aggregator. The following describes whether
+ * detecting "holes" in a write region is necessary, depending on whether the
+ * data sieving hint, romio_ds_write, is set to enable, disable, or automatic.
+ * + automatic - We need to check whether holes exist. If holes exist, the
+ *   "read-modify" part must run. If not, "read-modify" can be skipped.
+ * + enable - the "read-modify" part must be performed; hole checking, and
+ *   thus the heap-merge sort, can be skipped.
+ * + disable - the "read-modify" part must be skipped and holes need not be
+ *   checked, but srt_off_len must be constructed to merge all others_req[]
+ *   into a single sorted list, which requires calling a heap-merge sort.
+ *   This step is necessary because write data from all non-aggregators are
+ *   received into the same write_buf, with a possibility of overlaps, and
+ *   srt_off_len stores the coalesced offset-length pairs of individual
+ *   non-contiguous write requests and will be used to write them to the file.
+ *
+ * Heap-merge sort merges offset-length pairs received from all non-aggregators
+ * into a single list, which can be expensive. Its cost can be even larger than
+ * the cost of "read" in "read-modify-write". The two constants below are the
+ * lower bounds used to determine whether or not to perform such sorting, when
+ * data sieving is set to the automatic mode.
+ */
+#define PNCIO_DS_WR_NPAIRS_LB 8192
+#define PNCIO_DS_WR_NAGGRS_LB 256
+#define DO_HEAP_MERGE(nrecv, npairs) ((nrecv) > PNCIO_DS_WR_NAGGRS_LB || (npairs) > PNCIO_DS_WR_NPAIRS_LB)
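A small, self-contained sketch of how the DO_HEAP_MERGE threshold above can be
consulted; it assumes this header is on the include path and an MPI compiler
is used, and the sample counts are invented for illustration:

#include <stdio.h>
#include "pncio.h"

int main(void)
{
    long nrecv  = 300;  /* non-aggregators sending offset-length pairs */
    long npairs = 4096; /* total offset-length pairs received */

    /* the macro trips once either count exceeds its bound (300 > 256 here) */
    printf("DO_HEAP_MERGE(300, 4096) = %d\n",
           (int) DO_HEAP_MERGE(nrecv, npairs));
    return 0;
}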
+
+#define PNCIO_TYPE_DECREASE 0x00000001 /* if not monotonic nondecreasing */
+#define PNCIO_TYPE_OVERLAP  0x00000002 /* if contains overlapping regions */
+#define PNCIO_TYPE_NEGATIVE 0x00000004 /* if one of displacements is negative */
+
+enum {
+    PNCIO_HINT_AUTO    = 0,
+    PNCIO_HINT_ENABLE  = 1,
+    PNCIO_HINT_DISABLE = 2
+};
+
+typedef struct {
+    int striping_factor;
+    int striping_unit;
+    int cb_read;
+    int cb_write;
+    int cb_nodes;
+    int cb_buffer_size;
+    int ds_read;
+    int ds_write;
+    int no_indep_rw;
+    int ind_rd_buffer_size;
+    int ind_wr_buffer_size;
+    int start_iodevice;
+    int *ranklist;
+
+    union {
+        struct {
+            int num_osts;
+            int overstriping_ratio;
+        } lustre;
+    } fs_hints;
+} PNCIO_Hints;
+
+typedef struct {
+    MPI_Datatype  type;  /* MPI derived datatype */
+    MPI_Offset    size;  /* total size in bytes (sum of len[*]) */
+    MPI_Count     count; /* number of off-len pairs */
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset   *off;   /* [count] byte offsets */
+    MPI_Offset   *len;   /* [count] block lengths in bytes */
+#else
+    MPI_Offset   *off;   /* [count] byte offsets */
+    int          *len;   /* [count] block lengths in bytes */
+#endif
+    MPI_Count     idx;   /* index of off-len pairs consumed so far */
+    MPI_Aint      rem;   /* remaining amount in the pair to be consumed */
+    int           is_contig; /* whether view of file or buffer is contiguous */
+} PNCIO_View;
+
+typedef struct {
+    MPI_Comm     comm;        /* communicator indicating who called open */
+    const char  *filename;
+    int          file_system; /* type of file system */
+
+    int          fd_sys;      /* system file descriptor */
+    int          num_nodes;   /* number of unique compute nodes from
+                               * MPI_Get_processor_name() */
+    int         *node_ids;    /* [nprocs] node IDs of each rank */
+    int          access_mode; /* Access mode (sequential, append, etc.),
+                               * possibly modified to deal with
+                               * data sieving or deferred open */
+
+    int          is_open;     /* no_indep_rw, 0: not open yet 1: is open */
+
+    int          skip_read;   /* whether to skip reads in read-modify-write */
+
+    MPI_Offset   disp;        /* file displacement */
+    MPI_Datatype filetype;    /* file type set in fileview */
+                              /* etype in fileview is always MPI_BYTE in PnetCDF */
+    PNCIO_View   flat_file;  /* flattened filetype */
+
+    int          atomicity;  /* true=atomic, false=nonatomic */
+    char        *io_buf;     /* two-phase buffer allocated out of i/o path */
+    int          is_agg;     /* bool: if I am an aggregator */
+    int          my_cb_nodes_index; /* my index into fd->hints->ranklist[]. -1 if N/A */
+    PNCIO_Hints *hints;      /* structure containing fs-indep. info values */
+    MPI_Info     info;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double    write_timing[NMEASURES];
+    double    read_timing[NMEASURES];
+    MPI_Count write_counter[NMEASURES];
+    MPI_Count read_counter[NMEASURES];
+#endif
+} PNCIO_File;
+
+typedef struct {
+    MPI_Offset *offsets;  /* array of offsets */
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset *lens;     /* array of lengths */
+    MPI_Count  *mem_ptrs; /* array of pointers. used in the read/write phase to
+                           * indicate where the data is stored in memory
+                           * promoted to MPI_Count so we can construct types
+                           * with _c versions
+                           */
+    MPI_Count   count;    /* size of above arrays */
+#else
+    int        *lens;
+    MPI_Aint   *mem_ptrs;
+    size_t      count;
+#endif
+    size_t      curr; /* index of offsets/lens that is currently being processed */
+} PNCIO_Access;
+
+/*---- APIs -----------------------------------------------------------------*/
+extern
+int PNCIO_FileSysType(const char *filename);
+
+extern
+int PNCIO_File_open(MPI_Comm comm, const char *filename, int amode,
+                    MPI_Info info, PNCIO_File *fh);
+
+extern
+int PNCIO_File_close(PNCIO_File *fh);
+
+extern
+int PNCIO_File_set_view(PNCIO_File *fh, MPI_Offset disp, MPI_Datatype filetype,
+                        MPI_Aint npairs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                        MPI_Count *offsets, MPI_Count *lengths
+#else
+                        MPI_Offset *offsets, int *lengths
+#endif
+);
+
+extern
+int PNCIO_File_sync(PNCIO_File *fh);
+
+extern
+int PNCIO_File_delete(const char *filename);
+
+extern
+int PNCIO_File_set_size(PNCIO_File *fh, MPI_Offset size);
+
+extern
+int PNCIO_File_get_size(PNCIO_File *fh, MPI_Offset *size);
+
+extern
+int PNCIO_File_get_info(PNCIO_File *fh, MPI_Info *info_used);
+
+extern
+int PNCIO_File_SetInfo(PNCIO_File *fh, MPI_Info users_info);
+
+/* PNC I/O APIs */
+extern
+MPI_Offset PNCIO_File_write_at(PNCIO_File *fh, MPI_Offset offset,
+                               const void *buf, PNCIO_View buf_view);
+extern
+MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, MPI_Offset offset,
+                                   const void *buf, PNCIO_View buf_view);
+
+extern
+MPI_Offset PNCIO_File_read_at(PNCIO_File *fh, MPI_Offset offset, void *buf,
+                              PNCIO_View buf_view);
+extern
+MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh, MPI_Offset offset, void *buf,
+                                  PNCIO_View buf_view);
+
+extern
+MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, const void *buf,
+                             MPI_Offset w_size, MPI_Offset offset);
+
+extern
+MPI_Offset PNCIO_ReadContig(PNCIO_File *fd, void *buf, MPI_Offset r_size,
+                            MPI_Offset offset);
+
+/* utility APIs */
+extern
+void PNCIO_Calc_file_domains(MPI_Offset * st_offsets,
+             MPI_Offset *end_offsets, int nprocs, int nprocs_for_coll,
+             MPI_Offset *min_st_offset_ptr, MPI_Offset **fd_start_ptr,
+             MPI_Offset **fd_end_ptr, MPI_Offset *fd_size_ptr,
+             int striping_unit);
+
+extern
+void PNCIO_Calc_my_req(PNCIO_File *fd, MPI_Offset min_st_offset,
+             MPI_Offset *fd_start, MPI_Offset *fd_end, MPI_Offset fd_size,
+             int nprocs, MPI_Count *count_my_req_procs_ptr,
+             MPI_Count **count_my_req_per_proc_ptr,
+             PNCIO_Access **my_req_ptr, MPI_Aint **buf_idx_ptr);
+
+extern
+void PNCIO_Calc_others_req(PNCIO_File *fd, MPI_Count count_my_req_procs,
+             MPI_Count *count_my_req_per_proc, PNCIO_Access *my_req,
+             int nprocs, int myrank, MPI_Count
*count_others_req_procs_ptr, + MPI_Count **count_others_req_per_proc_ptr, + PNCIO_Access **others_req_ptr); + +extern +void PNCIO_Free_my_req(MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, MPI_Aint *buf_idx); + +extern +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req); + + +extern +int PNCIO_Calc_aggregator(PNCIO_File *fd, MPI_Offset off, MPI_Offset min_off, + MPI_Offset *len, MPI_Offset fd_size, MPI_Offset *fd_end); + +extern +void PNCIO_Heap_merge(PNCIO_Access *others_req, MPI_Count *count, + MPI_Offset *srt_off, MPI_Count *srt_len, MPI_Count *start_pos, + int nprocs, int nprocs_recv, MPI_Count total_elements); + +/* Generic APIs */ +extern +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, + int whence, MPI_Offset len); + +extern +MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +/* Lustre */ +extern +int PNCIO_Lustre_create(PNCIO_File *fd, int access_mode); + +extern +int PNCIO_Lustre_open(PNCIO_File *fd); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +extern +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, const void *buf, + PNCIO_View buf_view, MPI_Offset offset); + +#endif diff --git a/src/drivers/pncio/pncio_aggregate.c b/src/drivers/pncio/pncio_aggregate.c new file mode 100644 index 000000000..b75e48d92 --- /dev/null +++ b/src/drivers/pncio/pncio_aggregate.c @@ -0,0 +1,560 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +/* This file contains four functions: + * + * PNCIO_Calc_aggregator() + * PNCIO_Calc_file_domains() + * PNCIO_Calc_my_req() + * PNCIO_Free_my_req() + * PNCIO_Calc_others_req() + * PNCIO_Free_others_req() + * + * The last three of these were originally in ad_read_coll.c, but they are + * also shared with ad_write_coll.c. I felt that they were better kept with + * the rest of the shared aggregation code. + */ + +/* Discussion of values available from above: + * + * MPI_Offset st_offsets[0..nprocs-1] + * MPI_Offset end_offsets[0..nprocs-1] + * These contain a list of start and end offsets for each process in + * the communicator. For example, an access at loc 10, size 10 would + * have a start offset of 10 and end offset of 19. + * int nprocs + * number of processors in the collective I/O communicator + * MPI_Offset min_st_offset + * MPI_Offset fd_start[0..nprocs_for_coll-1] + * starting location of "file domain"; region that a given process will + * perform aggregation for (i.e. 
actually do I/O) + * MPI_Offset fd_end[0..nprocs_for_coll-1] + * start + size - 1 roughly, but it can be less, or 0, in the case of + * uneven distributions + */ + +/* PNCIO_Calc_aggregator() + * + * The intention here is to implement a function which provides basically + * the same functionality as in Rajeev's original version of + * PNCIO_Calc_my_req(). He used a ceiling division approach to assign the + * file domains, and we use the same approach here when calculating the + * location of an offset/len in a specific file domain. Further we assume + * this same distribution when calculating the rank_index, which is later + * used to map to a specific process rank in charge of the file domain. + * + * A better (i.e. more general) approach would be to use the list of file + * domains only. This would be slower in the case where the + * original ceiling division was used, but it would allow for arbitrary + * distributions of regions to aggregators. We'd need to know the + * nprocs_for_coll in that case though, which we don't have now. + * + * Note a significant difference between this function and Rajeev's old code: + * this code doesn't necessarily return a rank in the range + * 0..nprocs_for_coll; instead you get something in 0..nprocs. This is a + * result of the rank mapping; any set of ranks in the communicator could be + * used now. + * + * Returns an integer representing a rank in the collective I/O communicator. + * + * The "len" parameter is also modified to indicate the amount of data + * actually available in this file domain. + */ +int PNCIO_Calc_aggregator(PNCIO_File *fd, + MPI_Offset off, + MPI_Offset min_off, + MPI_Offset *len, + MPI_Offset fd_size, + MPI_Offset *fd_end) +{ + int rank_index, rank; + MPI_Offset avail_bytes; + + /* get an index into our array of aggregators */ + rank_index = (int) ((off - min_off + fd_size) / fd_size - 1); + + if (fd->hints->striping_unit > 0) { + /* Implementation for file domain alignment. Note fd_end[] have been + * aligned with file system lock boundaries when it was produced by + * PNCIO_Calc_file_domains(). + */ + rank_index = 0; + while (off > fd_end[rank_index]) + rank_index++; + } + + /* we index into fd_end with rank_index, and fd_end was allocated to be no + * bigger than fd->hins->cb_nodes. If we ever violate that, we're + * overrunning arrays. Obviously, we should never ever hit this abort */ + if (rank_index >= fd->hints->cb_nodes || rank_index < 0) { + fprintf(stderr, + "Error in PNCIO_Calc_aggregator(): rank_index(%d) >= fd->hints->cb_nodes (%d) fd_size="OFFFMT" off="OFFFMT"\n", + rank_index, fd->hints->cb_nodes, fd_size, off); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + /* remember here that even in Rajeev's original code it was the case that + * different aggregators could end up with different amounts of data to + * aggregate. here we use fd_end[] to make sure that we know how much + * data this aggregator is working with. + * + * the +1 is to take into account the end vs. length issue. + */ + avail_bytes = fd_end[rank_index] + 1 - off; + if (avail_bytes < *len) { + /* this file domain only has part of the requested contig. 
region */ + *len = avail_bytes; + } + + /* map our index to a rank */ + /* NOTE: FOR NOW WE DON'T HAVE A MAPPING...JUST DO 0..NPROCS_FOR_COLL */ + rank = fd->hints->ranklist[rank_index]; + + return rank; +} + +void PNCIO_Calc_file_domains(MPI_Offset *st_offsets, + MPI_Offset *end_offsets, + int nprocs, + int nprocs_for_coll, + MPI_Offset *min_st_offset_ptr, + MPI_Offset **fd_start_ptr, + MPI_Offset **fd_end_ptr, + MPI_Offset *fd_size_ptr, + int striping_unit) +{ +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. */ + + MPI_Offset min_st_offset, max_end_offset, *fd_start, *fd_end, fd_size; + int i; + +/* find min of start offsets and max of end offsets of all processes */ + + min_st_offset = st_offsets[0]; + max_end_offset = end_offsets[0]; + + for (i = 1; i < nprocs; i++) { + min_st_offset = MIN(min_st_offset, st_offsets[i]); + max_end_offset = MAX(max_end_offset, end_offsets[i]); + } + +/* determine the "file domain (FD)" of each process, i.e., the portion of + the file that will be "owned" by each process */ + +/* partition the total file access range equally among nprocs_for_coll + processes */ + fd_size = ((max_end_offset - min_st_offset + 1) + nprocs_for_coll - 1) / nprocs_for_coll; + /* ceiling division as in HPF block distribution */ + + *fd_start_ptr = (MPI_Offset *) NCI_Malloc(nprocs_for_coll * 2 * sizeof(MPI_Offset)); + *fd_end_ptr = *fd_start_ptr + nprocs_for_coll; + + fd_start = *fd_start_ptr; + fd_end = *fd_end_ptr; + + /* Wei-keng Liao: implementation for fild domain alignment to nearest file + * lock boundary (as specified by striping_unit hint). Could also + * experiment with other alignment strategies here */ + if (striping_unit > 0) { + MPI_Offset end_off; + int rem_front, rem_back; + + /* align fd_end[0] to the nearest file lock boundary */ + fd_start[0] = min_st_offset; + end_off = fd_start[0] + fd_size; + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[0] = end_off - 1; + + /* align fd_end[i] to the nearest file lock boundary */ + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + end_off = min_st_offset + fd_size * (i + 1); + rem_front = end_off % striping_unit; + rem_back = striping_unit - rem_front; + if (rem_front < rem_back) + end_off -= rem_front; + else + end_off += rem_back; + fd_end[i] = end_off - 1; + } + fd_end[nprocs_for_coll - 1] = max_end_offset; + } else { /* no hints set: do things the 'old' way */ + fd_start[0] = min_st_offset; + fd_end[0] = min_st_offset + fd_size - 1; + + for (i = 1; i < nprocs_for_coll; i++) { + fd_start[i] = fd_end[i - 1] + 1; + fd_end[i] = fd_start[i] + fd_size - 1; + } + } + +/* take care of cases in which the total file access range is not + divisible by the number of processes. In such cases, the last + process, or the last few processes, may have unequal load (even 0). + For example, a range of 97 divided among 16 processes. + Note that the division is ceiling division. 
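To make the ceiling-division partitioning concrete, below is a small standalone sketch, separate from the patch, that reproduces the 97-byte/16-process example from the comment above and shows how the trailing domains shrink or become empty:

#include <stdio.h>

/* Sketch: partition a 97-byte access range among 16 aggregators the way
 * PNCIO_Calc_file_domains() does when no striping alignment is in effect. */
int main(void) {
    long long min_st_offset = 0, max_end_offset = 96;  /* 97 bytes total */
    int nprocs_for_coll = 16;
    long long fd_size = (max_end_offset - min_st_offset + 1
                         + nprocs_for_coll - 1) / nprocs_for_coll; /* ceil(97/16) = 7 */
    for (int i = 0; i < nprocs_for_coll; i++) {
        long long st  = min_st_offset + (long long)i * fd_size;
        long long end = st + fd_size - 1;
        if (st > max_end_offset)       st = end = -1;        /* empty domain */
        else if (end > max_end_offset) end = max_end_offset; /* short domain */
        printf("domain %2d: [%lld, %lld]\n", i, st, end);
    }
    /* Domain 13 ends up as [91, 96] with only 6 bytes, and domains 14 and
     * 15 are empty, mirroring the clamping loop that follows. */
    return 0;
}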
*/ + + for (i = 0; i < nprocs_for_coll; i++) { + if (fd_start[i] > max_end_offset) + fd_start[i] = fd_end[i] = -1; + if (fd_end[i] > max_end_offset) + fd_end[i] = max_end_offset; + } + + *fd_size_ptr = fd_size; + *min_st_offset_ptr = min_st_offset; +} + + +/* PNCIO_Calc_my_req() - calculate what portions of the access requests + * of this process are located in the file domains of various processes + * (including this one) + */ +void PNCIO_Calc_my_req(PNCIO_File *fd, + MPI_Offset min_st_offset, + MPI_Offset *fd_start, + MPI_Offset *fd_end, + MPI_Offset fd_size, + int nprocs, + MPI_Count *count_my_req_procs_ptr, + MPI_Count **count_my_req_per_proc_ptr, + PNCIO_Access **my_req_ptr, + MPI_Aint **buf_idx_ptr) +/* Possibly reconsider if buf_idx's are ok as int's, or should they be aints/offsets? + They are used as memory buffer indices so it seems like the 2G limit is in effect */ +{ + MPI_Count *count_my_req_per_proc, count_my_req_procs, l; + MPI_Aint *buf_idx; + int proc; + size_t memLen, alloc_sz; + MPI_Offset fd_len, rem_len, curr_idx, off, *off_ptr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_ptr; +#else + int *len_ptr; +#endif + PNCIO_Access *my_req; + + *count_my_req_per_proc_ptr = NCI_Calloc(nprocs, sizeof(MPI_Count)); + count_my_req_per_proc = *count_my_req_per_proc_ptr; +/* count_my_req_per_proc[i] gives the no. of contig. requests of this + process in process i's file domain. calloc initializes to zero. + I'm allocating memory of size nprocs, so that I can do an + MPI_Alltoall later on.*/ + + buf_idx = (MPI_Aint *) NCI_Malloc(nprocs * sizeof(MPI_Aint)); +/* buf_idx is relevant only if buftype_is_contig. + buf_idx[i] gives the index into user_buf where data received + from proc. i should be placed. This allows receives to be done + without extra buffer. This can't be done if buftype is not contig. */ + + /* initialize buf_idx to -1 */ + for (int i = 0; i < nprocs; i++) + buf_idx[i] = -1; + + /* one pass just to calculate how much space to allocate for my_req */ + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (fd->flat_file.len[i] == 0) + continue; + off = fd->flat_file.off[i]; + fd_len = fd->flat_file.len[i]; + /* note: we set fd_len to be the total size of the access. then + * PNCIO_Calc_aggregator() will modify the value to return the + * amount that was available from the file domain that holds the + * first part of the access. + */ + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end); + count_my_req_per_proc[proc]++; + + /* figure out how much data is remaining in the access (i.e. wasn't + * part of the file domain that had the starting byte); we'll take + * care of this data (if there is any) in the while loop below. + */ + rem_len = fd->flat_file.len[i] - fd_len; + + while (rem_len != 0) { + off += fd_len; /* point to first remaining byte */ + fd_len = rem_len; /* save remaining size, pass to calc */ + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, + fd_size, fd_end); + + count_my_req_per_proc[proc]++; + rem_len -= fd_len; /* reduce remaining length by amount from fd */ + } + } + +/* now allocate space for my_req, offset, and len */ + + *my_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + my_req = *my_req_ptr; + + /* combine offsets and lens into a single regions so we can make one + * exchange instead of two later on. 
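The way one contiguous request is split across consecutive file domains (the rem_len loop above) can be illustrated with a minimal sketch; it uses the unaligned ceiling-division mapping and continues the 97/16 numbers, so fd_size is 7:

#include <stdio.h>

/* Sketch of how one contiguous request is carved up across file domains,
 * as in the rem_len loop of PNCIO_Calc_my_req(). Striping alignment is
 * ignored for brevity. */
int main(void) {
    long long min_off = 0, fd_size = 7;   /* from the 97/16 example */
    long long off = 10, len = 20;         /* request covers [10, 29] */
    while (len > 0) {
        int rank_index = (int)((off - min_off + fd_size) / fd_size - 1);
        long long fd_end = min_off + (rank_index + 1) * fd_size - 1;
        long long avail  = fd_end + 1 - off;   /* bytes left in this domain */
        long long piece  = (avail < len) ? avail : len;
        printf("aggregator index %d gets [%lld, %lld]\n",
               rank_index, off, off + piece - 1);
        off += piece;
        len -= piece;
    }
    /* prints pieces [10,13], [14,20], [21,27], [28,29] landing in file
     * domains 1 through 4, one my_req entry per piece */
    return 0;
}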
Over-allocate the 'offsets' array and + * make 'lens' point to the over-allocated part + */ + memLen = 0; + for (int i = 0; i < nprocs; i++) + memLen += count_my_req_per_proc[i]; + +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2; + my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + my_req[0].lens = my_req[0].offsets + memLen; +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + my_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + my_req[0].lens = (int*) (my_req[0].offsets + memLen); +#endif + + off_ptr = my_req[0].offsets; + len_ptr = my_req[0].lens; + count_my_req_procs = 0; + for (int i = 0; i < nprocs; i++) { + if (count_my_req_per_proc[i]) { + my_req[i].offsets = off_ptr; + off_ptr += count_my_req_per_proc[i]; + my_req[i].lens = len_ptr; + len_ptr += count_my_req_per_proc[i]; + count_my_req_procs++; + } + my_req[i].count = 0; /* will be incremented where needed + * later */ + } + +/* now fill in my_req */ + curr_idx = 0; + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + /* short circuit offset/len processing if len == 0 + * (zero-byte read/write */ + if (fd->flat_file.len[i] == 0) + continue; + off = fd->flat_file.off[i]; + fd_len = fd->flat_file.len[i]; + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, fd_size, fd_end); + + /* for each separate contiguous access from this process */ + if (buf_idx[proc] == -1) { + assert(curr_idx == (MPI_Aint) curr_idx); + buf_idx[proc] = (MPI_Aint) curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + + rem_len = fd->flat_file.len[i] - fd_len; + + /* store the proc, offset, and len information in an array + * of structures, my_req. Each structure contains the + * offsets and lengths located in that process's FD, + * and the associated count. + */ + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + + while (rem_len != 0) { + off += fd_len; + fd_len = rem_len; + proc = PNCIO_Calc_aggregator(fd, off, min_st_offset, &fd_len, + fd_size, fd_end); + + if (buf_idx[proc] == -1) { + assert(curr_idx == (MPI_Aint) curr_idx); + buf_idx[proc] = (MPI_Aint) curr_idx; + } + + l = my_req[proc].count; + curr_idx += fd_len; + rem_len -= fd_len; + + my_req[proc].offsets[l] = off; + my_req[proc].lens[l] = fd_len; + my_req[proc].count++; + } + } + + *count_my_req_procs_ptr = count_my_req_procs; + *buf_idx_ptr = buf_idx; +} + +void PNCIO_Free_my_req(MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, + MPI_Aint *buf_idx) +{ + NCI_Free(count_my_req_per_proc); + NCI_Free(my_req[0].offsets); + NCI_Free(my_req); + NCI_Free(buf_idx); +} + +void PNCIO_Calc_others_req(PNCIO_File *fd, + MPI_Count count_my_req_procs, + MPI_Count *count_my_req_per_proc, + PNCIO_Access *my_req, + int nprocs, + int myrank, + MPI_Count *count_others_req_procs_ptr, + MPI_Count **count_others_req_per_proc_ptr, + PNCIO_Access **others_req_ptr) +{ +/* determine what requests of other processes lie in this process's + file domain */ + +/* count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. 
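The handshake that opens PNCIO_Calc_others_req() below is a single MPI_Alltoall of these per-destination counts. Here is a self-contained sketch of that step; it uses MPI_INT for brevity where the patch uses MPI_COUNT, and the count pattern is made up:

#include <mpi.h>
#include <stdlib.h>

/* Sketch of the count handshake: each rank tells every other rank how many
 * contiguous pieces of its request fall in that rank's file domain. */
int main(int argc, char **argv) {
    int nprocs, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int *my_counts     = malloc(nprocs * sizeof(int)); /* what I send to i */
    int *others_counts = malloc(nprocs * sizeof(int)); /* what i sends me  */
    for (int i = 0; i < nprocs; i++)
        my_counts[i] = (rank + i) % 3;   /* illustrative counts only */

    /* after this call, others_counts[i] sizes the Irecv buffers that the
     * real code posts for rank i's offsets and lengths */
    MPI_Alltoall(my_counts, 1, MPI_INT, others_counts, 1, MPI_INT,
                 MPI_COMM_WORLD);

    free(my_counts);
    free(others_counts);
    MPI_Finalize();
    return 0;
}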
*/ + + MPI_Count *count_others_req_per_proc, count_others_req_procs; + size_t alloc_sz; + int i, j; + MPI_Request *requests; + PNCIO_Access *others_req; + size_t memLen; + MPI_Offset *off_ptr; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_ptr; + MPI_Count *mem_ptr; +#else + int *len_ptr; + MPI_Aint *mem_ptr; +#endif + +/* first find out how much to send/recv and from/to whom */ + count_others_req_per_proc = NCI_Malloc(nprocs * sizeof(MPI_Count)); + + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); + + *others_req_ptr = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + others_req = *others_req_ptr; + + memLen = 0; + for (i = 0; i < nprocs; i++) + memLen += count_others_req_per_proc[i]; + +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2 + sizeof(MPI_Count); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = others_req[0].offsets + memLen; + others_req[0].mem_ptrs = (MPI_Count*) (others_req[0].lens + memLen); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int) + sizeof(MPI_Aint); + others_req[0].offsets = (MPI_Offset *) NCI_Malloc(memLen * alloc_sz); + others_req[0].lens = (int *) (others_req[0].offsets + memLen); + others_req[0].mem_ptrs = (MPI_Aint*) (others_req[0].lens + memLen); +#endif + off_ptr = others_req[0].offsets; + len_ptr = others_req[0].lens; + mem_ptr = others_req[0].mem_ptrs; + + count_others_req_procs = 0; + for (i = 0; i < nprocs; i++) { + if (count_others_req_per_proc[i]) { + others_req[i].count = count_others_req_per_proc[i]; + others_req[i].offsets = off_ptr; + off_ptr += count_others_req_per_proc[i]; + others_req[i].lens = len_ptr; + len_ptr += count_others_req_per_proc[i]; + others_req[i].mem_ptrs = mem_ptr; + mem_ptr += count_others_req_per_proc[i]; + count_others_req_procs++; + } else + others_req[i].count = 0; + } + *count_others_req_per_proc_ptr = count_others_req_per_proc; + +/* now send the calculated offsets and lengths to respective processes */ + + requests = (MPI_Request *) + NCI_Malloc((count_my_req_procs + count_others_req_procs) * 2 * sizeof(MPI_Request)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) + continue; + if (i == myrank) { + /* send to self uses memcpy()C, here others_req[i].count == my_req[i].count */ + memcpy(others_req[i].offsets, my_req[i].offsets, + my_req[i].count * sizeof(MPI_Offset)); +#ifdef HAVE_MPI_LARGE_COUNT + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(MPI_Offset)); +#else + memcpy(others_req[i].lens, my_req[i].lens, + my_req[i].count * sizeof(int)); +#endif + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv_c(others_req[i].lens, others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else + assert(others_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Irecv(others_req[i].offsets, (int)others_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Irecv(others_req[i].lens, (int)others_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + for (i = 0; i < nprocs; i++) { + if (my_req[i].count && i != myrank) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Isend_c(my_req[i].offsets, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend_c(my_req[i].lens, my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); +#else 
+ assert(my_req[i].count <= 2147483647); /* overflow 4-byte int */ + MPI_Isend(my_req[i].offsets, (int)my_req[i].count, + MPI_OFFSET, i, i + myrank, fd->comm, &requests[j++]); + MPI_Isend(my_req[i].lens, (int)my_req[i].count, + MPI_INT, i, i + myrank, fd->comm, &requests[j++]); +#endif + } + } + + if (j) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(j, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) NCI_Malloc(j * sizeof(MPI_Status)); + MPI_Waitall(j, requests, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(requests); + + *count_others_req_procs_ptr = count_others_req_procs; +} + +void PNCIO_Free_others_req(MPI_Count *count_others_req_per_proc, + PNCIO_Access *others_req) +{ + NCI_Free(count_others_req_per_proc); + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); +} + diff --git a/src/drivers/pncio/pncio_close.c b/src/drivers/pncio/pncio_close.c new file mode 100644 index 000000000..4ecc09cc6 --- /dev/null +++ b/src/drivers/pncio/pncio_close.c @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* strdup() */ +#include +#include + +#include + +#include "pncio.h" + +/*----< PNCIO_File_close() >--------------------------------------------------*/ +int PNCIO_File_close(PNCIO_File *fh) +{ + int err = NC_NOERR; + + err = close(fh->fd_sys); + if (err != 0) + err = ncmpii_error_posix2nc("close"); + + if (fh->hints->ranklist != NULL) + NCI_Free(fh->hints->ranklist); + if (fh->hints != NULL) + NCI_Free(fh->hints); + if (fh->info != MPI_INFO_NULL) + MPI_Info_free(&(fh->info)); + if (fh->io_buf != NULL) + NCI_Free(fh->io_buf); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int i, rank; + double timing[NMEASURES*2], max_t[NMEASURES*2], pread_t; + MPI_Count max_ntimes, counter[NMEASURES*2], max_c[NMEASURES*2]; + + /* print two-phase I/O timing breakdown */ + MPI_Comm_rank(fh->comm, &rank); + for (i=0; iwrite_timing[i]; + counter[i] = fh->write_counter[i]; + timing[i+NMEASURES] = fh->read_timing[i]; + counter[i+NMEASURES] = fh->read_counter[i]; + } + MPI_Reduce(timing, max_t, NMEASURES*2, MPI_DOUBLE, MPI_MAX, 0, fh->comm); + MPI_Reduce(counter, max_c, NMEASURES*2, MPI_COUNT, MPI_MAX, 0, fh->comm); + + pread_t = max_t[NMEASURES+2]; + max_ntimes = max_c[0]; + + if (rank == 0 && max_ntimes > 0) { + printf("%s: TWO-PHASE write init %5.2f pwrite %5.2f pread %5.2f post %5.2f hsort %5.2f comm %5.2f collw %5.2f\n", + __func__, max_t[1], max_t[2], pread_t, max_t[4], max_t[5], max_t[3], max_t[0]); + printf("%s: TWO-PHASE write ntimes %lld check_hole %lld (total_num %lld nrecv %lld) no check %lld (total_num %lld nrecv %lld)\n", + __func__, max_c[0], max_c[1], max_c[2], max_c[3], max_c[4], max_c[5], max_c[6]); + } + + max_ntimes = max_c[NMEASURES]; + + if (rank == 0 && max_ntimes > 0) + printf("%s: TWO-PHASE read init %5.2f pread %5.2f post %5.2f wait %5.2f collr %5.2f ntimes %lld\n", + __func__, max_t[NMEASURES+1], max_t[NMEASURES+2], max_t[NMEASURES+4], max_t[NMEASURES+3], max_t[NMEASURES+0], max_ntimes); +#endif + + return err; +} diff --git a/src/drivers/pncio/pncio_delete.c b/src/drivers/pncio/pncio_delete.c new file mode 100644 index 000000000..514f3a325 --- /dev/null +++ b/src/drivers/pncio/pncio_delete.c @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
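A usage sketch of the open/close pair whose close side is shown above; this is inferred only from the prototypes declared in pncio.h, and real callers would also initialize the PNCIO_File fields and check errors more carefully:

#include <mpi.h>
#include "pncio.h"   /* the header introduced earlier in this patch */

static int example_open_close(MPI_Comm comm, const char *path)
{
    PNCIO_File fh;
    int err = PNCIO_File_open(comm, path, MPI_MODE_RDWR, MPI_INFO_NULL, &fh);
    if (err != NC_NOERR)
        return err;
    /* ... PNCIO_File_write_at_all() / PNCIO_File_read_at_all() ... */
    return PNCIO_File_close(&fh);  /* releases hints, info, and io_buf */
}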
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#ifdef HAVE_UNISTD_H +#include /* unlink() */ +#endif + +#include +#include "pncio.h" + +/*----< PNCIO_File_delete() >-------------------------------------------------*/ +int PNCIO_File_delete(const char *filename) +{ + int err = NC_NOERR; + char *path = ncmpii_remove_file_system_type_prefix(filename); + + err = unlink(path); + if (err != 0) + err = ncmpii_error_posix2nc("unlink"); + + return err; +} + diff --git a/src/drivers/pncio/pncio_fstype.c b/src/drivers/pncio/pncio_fstype.c new file mode 100644 index 000000000..9713b7011 --- /dev/null +++ b/src/drivers/pncio/pncio_fstype.c @@ -0,0 +1,233 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include /* readlink() */ +#include /* strdup() */ +#include +#include +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* basename() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_VFS_H +#include +#endif +#ifdef HAVE_SYS_STATVFS_H +#include +#endif +#ifdef HAVE_SYS_PARAM_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_MOUNT_H +#include /* struct statfs */ +#endif +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat(), lstat(), stat() */ +#endif + +#include + +#include "pncio.h" + +/* In a strict ANSI environment, S_ISLNK may not be defined. Fix that here. + * We assume that S_ISLNK is *always* defined as a macro. If that is not + * universally true, then add a test to the configure that tries to link + * a program that references S_ISLNK + */ +#if !defined(S_ISLNK) +#if defined(S_IFLNK) +/* Check for the link bit */ +#define S_ISLNK(mode) ((mode) & S_IFLNK) +#else +/* no way to check if it is a link, so say false */ +#define S_ISLNK(mode) 0 +#endif +#endif /* !(S_ISLNK) */ + +/* Returns a string, the parent directory of a given filename. + * The caller should free the memory located returned by this subroutine. + */ +static +void parentdir(const char *filename, char **dirnamep) +{ + int err; + char *dir = NULL, *slash; + struct stat statbuf; + + err = lstat(filename, &statbuf); + + if (err || (!S_ISLNK(statbuf.st_mode))) { + /* No such file, or file is not a link; these are the "normal" cases + * where we can just return the parent directory. + */ + dir = NCI_Strdup(filename); + } else { + /* filename is a symlink. We've presumably already tried to stat it + * and found it to be missing (dangling link), but this code doesn't + * care if the target is really there or not. + */ + ssize_t namelen; + char *linkbuf; + + linkbuf = NCI_Malloc(PATH_MAX + 1); + namelen = readlink(filename, linkbuf, PATH_MAX + 1); + if (namelen == -1) { + /* Something strange has happened between the time that we + * determined that this was a link and the time that we attempted + * to read it; punt and use the old name. 
+ */ + dir = NCI_Strdup(filename); + } else { + /* successfully read the link */ + linkbuf[namelen] = '\0'; /* readlink doesn't null terminate */ + dir = NCI_Strdup(linkbuf); + } + NCI_Free(linkbuf); + } + + slash = strrchr(dir, '/'); + if (!slash) + strncpy(dir, ".", 2); + else { + if (slash == dir) + *(dir + 1) = '\0'; + else + *slash = '\0'; + } + + *dirnamep = dir; + return; +} + +#define UNKNOWN_SUPER_MAGIC (0xDEADBEEF) +#ifndef LL_SUPER_MAGIC +#define LL_SUPER_MAGIC 0x0BD00BD0 +#endif + +static int check_statfs(const char *filename, int64_t * file_id) +{ + int err = 0; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + /* rare: old solaris machines */ + struct statvfs vfsbuf; +#endif +#if defined(HAVE_STRUCT_STATFS_F_TYPE) || defined(HAVE_STRUCT_STATFS_F_FSTYPENAME) + /* common fs-detection logic for any modern POSIX-compliant environment, + * with the one wrinkle that some platforms (Darwin, BSD) give us a file + * system as a string, not an identifier */ + struct statfs fsbuf; +#endif + + *file_id = UNKNOWN_SUPER_MAGIC; + +#ifdef HAVE_STRUCT_STATVFS_WITH_F_BASETYPE + err = statvfs(filename, &vfsbuf); + if (err == 0) + *file_id = vfsbuf.f_basetype; +#endif + + /* remember above how I said 'statfs with f_type' was the common linux-y + * way to report file system type? Darwin (and probably the BSDs) *also* + * uses f_type but it is "reserved" and does not give us anything + * meaningful. Fine. If configure detects f_type we'll use it here and on + * those "reserved" platforms we'll ignore that result and check the + * f_fstypename field. + */ + +#ifdef HAVE_STRUCT_STATFS_F_TYPE + err = statfs(filename, &fsbuf); + if (err == 0) { + *file_id = fsbuf.f_type; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STATFS_F_FSTYPENAME + /* these stat routines store the file system type in a string */ + err = statfs(filename, &fsbuf); + if (err == 0 && !strncasecmp(fsbuf.f_fstypename, "lustre", 6)) { + *file_id = LL_SUPER_MAGIC; + return 0; + } +#endif + +#ifdef HAVE_STRUCT_STAT_ST_FSTYPE + struct stat sbuf; + err = stat(filename, &sbuf); + if (err == 0) { + *file_id = sbuf.st_fstype; + return 0; + } +#endif + return err; +} + +/* Check if file system type from file name, using a system-dependent function + * call. + */ +int PNCIO_FileSysType(const char *filename) +{ + + int err, retry_cnt; + int64_t file_id=UNKNOWN_SUPER_MAGIC; + + char *colon = strchr(filename, ':'); + if (colon != NULL) { /* there is a prefix end with : */ + if (!strncmp(filename, "lustre", 6)) + return PNCIO_LUSTRE; + else if (!strncmp(filename, "ufs", 3)) + return PNCIO_UFS; + else + return 0; + } +#ifdef MIMIC_LUSTRE + return PNCIO_LUSTRE; +#endif + + /* NFS can get stuck and end up returning ESTALE "forever" */ + +#define MAX_ESTALE_RETRY 10000 + + retry_cnt = 0; + do { + err = check_statfs(filename, &file_id); + } while (err && (errno == ESTALE) && retry_cnt++ < MAX_ESTALE_RETRY); + + if (err) { + /* ENOENT may be returned in two cases: + * 1) no directory entry for "filename" + * 2) "filename" is a dangling symbolic link + * + * parentdir() tries to deal with both cases. 
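A usage sketch of the detection logic in this function: a "lustre:" or "ufs:" prefix is honored without touching the file system, while plain paths fall through to the statfs()-based probe. PNCIO_LUSTRE and PNCIO_UFS are the constants used throughout this patch:

#include <stdio.h>
#include "pncio.h"

static void report_fstype(const char *path)
{
    int t = PNCIO_FileSysType(path);
    printf("%s -> %s\n", path, (t == PNCIO_LUSTRE) ? "PNCIO_LUSTRE"
                             : (t == PNCIO_UFS)    ? "PNCIO_UFS"
                                                   : "unrecognized");
}

/* report_fstype("lustre:/scratch/out.nc");  prefix match, no statfs() call */
/* report_fstype("/tmp/out.nc");             probes the underlying fs      */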
+ */ + if (errno == ENOENT) { + char *dir; + parentdir(filename, &dir); + err = check_statfs(dir, &file_id); + NCI_Free(dir); + } else + return 0; + } + + if (file_id == LL_SUPER_MAGIC) + return PNCIO_LUSTRE; + else + return PNCIO_UFS; /* UFS support if we don't know what else to use */ +} + diff --git a/src/drivers/pncio/pncio_hints.c b/src/drivers/pncio/pncio_hints.c new file mode 100644 index 000000000..0e7db29a3 --- /dev/null +++ b/src/drivers/pncio/pncio_hints.c @@ -0,0 +1,326 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "pncio.h" + +/*----< PNCIO_File_get_info() >-----------------------------------------------*/ +int PNCIO_File_get_info(PNCIO_File *fd, + MPI_Info *info_used) +{ + int err; + + err = MPI_Info_dup(fd->info, info_used); + if (err == MPI_SUCCESS) + err = NC_NOERR; + else + err = ncmpii_error_mpi2nc(err, "MPI_Info_dup"); + + return err; +} + +/*----< Info_check_and_install_int() >---------------------------------------*/ +static +int Info_check_and_install_int(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int intval, tmp_val, flag, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + intval = atoi(value); + tmp_val = intval; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != intval) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + + MPI_Info_set(fd->info, key, value); + /* some file systems do not cache hints in the fd struct */ + if (local_cache != NULL) + *local_cache = intval; + } +fn_exit: + return ret; +} + +/*----< Info_check_and_install_enabled() >-----------------------------------*/ +static +int Info_check_and_install_enabled(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int tmp_val, flag, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + if (!strcmp(value, "enable") || !strcmp(value, "ENABLE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_ENABLE; + } else if (!strcmp(value, "disable") || !strcmp(value, "DISABLE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_DISABLE; + } else if (!strcmp(value, "automatic") || !strcmp(value, "AUTOMATIC")) { + MPI_Info_set(fd->info, key, value); + *local_cache = PNCIO_HINT_AUTO; + /* treat the user-provided string like "enabled": either it is a + * hint ROMIO knows about and can support it, or ROMIO will not + * return the hint at all in the MPI_File_get_info info object + */ + } else if (!strcmp(value, "requested") || !strcmp(value, "REQUESTED")) { + MPI_Info_set(fd->info, key, "enable"); + *local_cache = PNCIO_HINT_ENABLE; + } + + tmp_val = *local_cache; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != *local_cache) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + } +fn_exit: + return ret; +} + +/*----< Info_check_and_install_true() >--------------------------------------*/ +static +int Info_check_and_install_true(PNCIO_File *fd, + MPI_Info info, + const char *key, + int *local_cache) +{ + int flag, tmp_val, ret = 0; + char value[MPI_MAX_INFO_VAL + 1]; + + 
MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + if (!strcmp(value, "true") || !strcmp(value, "TRUE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = 1; + } else if (!strcmp(value, "false") || !strcmp(value, "FALSE")) { + MPI_Info_set(fd->info, key, value); + *local_cache = 0; + } + tmp_val = *local_cache; + + MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); + /* --BEGIN ERROR HANDLING-- */ + if (tmp_val != *local_cache) { + ret = ncmpii_error_mpi2nc(MPI_ERR_NOT_SAME, __func__); + goto fn_exit; + } + /* --END ERROR HANDLING-- */ + } +fn_exit: + return ret; +} + +#if 0 +/*----< Info_check_and_install_str() >---------------------------------------*/ +static +int Info_check_and_install_str(PNCIO_File *fd, + MPI_Info info, + const char *key, + char **local_cache) +{ + int flag, ret = 0; + size_t len; + char value[MPI_MAX_INFO_VAL + 1]; + + MPI_Info_get(info, key, MPI_MAX_INFO_VAL, value, &flag); + if (flag) { + MPI_Info_set(fd->info, key, value); + len = (strlen(value) + 1) * sizeof(char); + *local_cache = NCI_Malloc(len); + if (*local_cache == NULL) { + ret = NC_ENOMEM; + goto fn_exit; + } + strncpy(*local_cache, value, len); + } +fn_exit: + return ret; +} +#endif + +/*----< PNCIO_File_SetInfo() >------------------------------------------------*/ +/* For PnetCDF, a file info object can only be passed to PnetCDF at file create + * or open call, i.e. I/O hints cannot be changed after file create/open. + * + * When users_info == MPI_INFO_NULL, this subroutine is an independent call. + * When users_info != MPI_INFO_NULL, this subroutine is a collective call, + * because it calls Info_check_and_install_xxx(), which checks the consistency + * of all hints values set in user's info object. + * + * TODO: instead of sync each hint, a better implementation is to have root + * bcast all hints and let each process checks inconsistency locally. + */ +int +PNCIO_File_SetInfo(PNCIO_File *fd, + MPI_Info users_info) +{ + int nprocs; + char value[MPI_MAX_INFO_VAL + 1]; + + if (users_info == MPI_INFO_NULL) + return NC_NOERR; + + MPI_Comm_size(fd->comm, &nprocs); + + /* initialize fd->info and hints to default values */ + MPI_Info_create(&(fd->info)); + + /* buffer size for collective I/O */ + MPI_Info_set(fd->info, "cb_buffer_size", PNCIO_CB_BUFFER_SIZE_DFLT); + fd->hints->cb_buffer_size = atoi(PNCIO_CB_BUFFER_SIZE_DFLT); + + /* default is to let pncio automatically decide whether or not to use + * collective buffering + */ + MPI_Info_set(fd->info, "romio_cb_read", "automatic"); + fd->hints->cb_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_cb_write", "automatic"); + fd->hints->cb_write = PNCIO_HINT_AUTO; + + /* cb_nodes may be set later right after file open call */ + fd->hints->cb_nodes = 0; + + /* hint indicating that no indep. 
I/O will be performed on this file */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = 0; + + /* buffer size for data sieving in independent reads */ + MPI_Info_set(fd->info, "ind_rd_buffer_size", PNCIO_IND_RD_BUFFER_SIZE_DFLT); + fd->hints->ind_rd_buffer_size = atoi(PNCIO_IND_RD_BUFFER_SIZE_DFLT); + + /* buffer size for data sieving in independent writes */ + MPI_Info_set(fd->info, "ind_wr_buffer_size", PNCIO_IND_WR_BUFFER_SIZE_DFLT); + fd->hints->ind_wr_buffer_size = atoi(PNCIO_IND_WR_BUFFER_SIZE_DFLT); + + /* default is to let romio automatically decide when to use data + * sieving + */ + MPI_Info_set(fd->info, "romio_ds_read", "automatic"); + fd->hints->ds_read = PNCIO_HINT_AUTO; + MPI_Info_set(fd->info, "romio_ds_write", "automatic"); + fd->hints->ds_write = PNCIO_HINT_AUTO; + + /* File striping parameters will be retrieved from the file system set, + * once the file is opened. These parameters can also be customized by + * a user's info. Thus, default values used below are to indicate + * whether or not they have been customized by the users. + */ + fd->hints->striping_unit = 0; + fd->hints->striping_factor = 0; + fd->hints->start_iodevice = -1; + /* Lustre overstriping ratio. 0 or 1 means disabled */ + fd->hints->fs_hints.lustre.overstriping_ratio = 1; + + /* add in user's info --------------------------------------------------*/ + Info_check_and_install_int(fd, users_info, "cb_buffer_size", + &fd->hints->cb_buffer_size); + + /* enable/disable collective buffering */ + Info_check_and_install_enabled(fd, users_info, "romio_cb_read", + &fd->hints->cb_read); + if (fd->hints->cb_read == PNCIO_HINT_DISABLE) { + /* romio_cb_read overrides no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = PNCIO_HINT_DISABLE; + } + + Info_check_and_install_enabled(fd, users_info, "romio_cb_write", + &fd->hints->cb_write); + if (fd->hints->cb_write == PNCIO_HINT_DISABLE) { + /* romio_cb_write overrides no_indep_rw */ + MPI_Info_set(fd->info, "romio_no_indep_rw", "false"); + fd->hints->no_indep_rw = PNCIO_HINT_DISABLE; + } + + /* user intends to call collective I/O APIs only */ + Info_check_and_install_true(fd, users_info, "romio_no_indep_rw", + &fd->hints->no_indep_rw); + if (fd->hints->no_indep_rw == 1) { + /* if 'no_indep_rw' set, also hint that we will do + * collective buffering: if we aren't doing independent io, + * then we have to do collective */ + MPI_Info_set(fd->info, "romio_cb_write", "enable"); + MPI_Info_set(fd->info, "romio_cb_read", "enable"); + fd->hints->cb_read = PNCIO_HINT_ENABLE; + fd->hints->cb_write = PNCIO_HINT_ENABLE; + } + + /* enable/disable data sieving */ + Info_check_and_install_enabled(fd, users_info, "romio_ds_read", + &fd->hints->ds_read); + Info_check_and_install_enabled(fd, users_info, "romio_ds_write", + &fd->hints->ds_write); + + /* number of I/O aggregators */ + Info_check_and_install_int(fd, users_info, "cb_nodes", + &fd->hints->cb_nodes); + /* check ill value */ + if (fd->hints->cb_nodes > 0 && fd->hints->cb_nodes <= nprocs) { + snprintf(value, MPI_MAX_INFO_VAL + 1, "%d", fd->hints->cb_nodes); + MPI_Info_set(fd->info, "cb_nodes", value); + } + else { + fd->hints->cb_nodes = 0; + MPI_Info_set(fd->info, "cb_nodes", "0"); + } + + Info_check_and_install_int(fd, users_info, "ind_wr_buffer_size", + &fd->hints->ind_wr_buffer_size); + Info_check_and_install_int(fd, users_info, "ind_rd_buffer_size", + &fd->hints->ind_rd_buffer_size); + + /* file striping configuration */ + 
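For context, here is a minimal sketch of how an application would supply the hints handled in this function at create time. The hint values are illustrative; ncmpi_create() is PnetCDF's standard create entry point, and the hint keys are the ones parsed above and below:

#include <mpi.h>
#include <pnetcdf.h>

static int create_with_hints(MPI_Comm comm, const char *path, int *ncidp)
{
    MPI_Info info;
    int err;

    MPI_Info_create(&info);
    MPI_Info_set(info, "striping_factor", "8");       /* 8 OSTs           */
    MPI_Info_set(info, "striping_unit",   "1048576"); /* 1 MiB stripes    */
    MPI_Info_set(info, "cb_nodes",        "16");      /* 16 aggregators   */

    /* hints can only be passed at create/open time, per the comment above */
    err = ncmpi_create(comm, path, NC_CLOBBER | NC_64BIT_DATA, info, ncidp);
    MPI_Info_free(&info);
    return err;
}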
Info_check_and_install_int(fd, users_info, "striping_unit", + &fd->hints->striping_unit); + + Info_check_and_install_int(fd, users_info, "striping_factor", + &fd->hints->striping_factor); + + Info_check_and_install_int(fd, users_info, "start_iodevice", + &fd->hints->start_iodevice); + + /* Lustre overstriping ratio. 0 or 1 means disabled */ + Info_check_and_install_int(fd, users_info, "lustre_overstriping_ratio", + &fd->hints->fs_hints.lustre.overstriping_ratio); + + /* PnetCDF ignores the following hints. + * cb_config_list + * deferred_open + */ + + return NC_NOERR; +} + diff --git a/src/drivers/pncio/pncio_lock.c b/src/drivers/pncio/pncio_lock.c new file mode 100644 index 000000000..a78d181db --- /dev/null +++ b/src/drivers/pncio/pncio_lock.c @@ -0,0 +1,156 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include + +static +const char *GEN_flock_cmd_to_string(int cmd) +{ + switch (cmd) { +#ifdef F_GETLK64 + case F_GETLK64: + return "F_GETLK64"; +#else + case F_GETLK: + return "F_GETLK"; +#endif +#ifdef F_SETLK64 + case F_SETLK64: + return "F_SETLK64"; +#else + case F_SETLK: + return "F_SETLK"; +#endif +#ifdef F_SETLKW64 + case F_SETLKW64: + return "F_SETLKW64"; +#else + case F_SETLKW: + return "F_SETLKW"; +#endif + default: + return "UNEXPECTED"; + } +} + +static +const char *GEN_flock_type_to_string(int type) +{ + switch (type) { + case F_RDLCK: + return "F_RDLCK"; + case F_WRLCK: + return "F_WRLCK"; + case F_UNLCK: + return "F_UNLOCK"; + default: + return "UNEXPECTED"; + } +} + +int PNCIO_GEN_SetLock(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code, err_count = 0, sav_errno; + struct flock lock; + + if (len == 0) + return MPI_SUCCESS; + + + /* Depending on the compiler flags and options, struct flock + * may not be defined with types that are the same size as + * MPI_Offsets. */ +/* FIXME: This is a temporary hack until we use flock64 where + available. It also doesn't fix the broken Solaris header sys/types.h + header file, which declares off_t as a UNION ! Configure tests to + see if the off64_t is a union if large file support is requested; + if so, it does not select large file support. +*/ +#ifdef NEEDS_INT_CAST_WITH_FLOCK + lock.l_type = type; + lock.l_start = (int) offset; + lock.l_whence = whence; + lock.l_len = (int) len; +#else + lock.l_type = type; + lock.l_whence = whence; + lock.l_start = offset; + lock.l_len = len; +#endif + + sav_errno = errno; /* save previous errno in case we recover from retryable errors */ + errno = 0; + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && ((errno == EINTR) || ((errno == EINPROGRESS) && (++err_count < 10000)))); + + if (err && (errno != EBADF)) { + /* FIXME: This should use the error message system, + * especially for MPICH */ + fprintf(stderr, + "This requires fcntl(2) to be implemented. As of 8/25/2011 it is not. 
Generic MPICH Message: File locking failed in PNCIO_GEN_SetLock(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "- If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n" + "- If the file system is LUSTRE, ensure that the directory is mounted with the 'flock' option.\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + if (!err) /* report fcntl failure errno's (EBADF), otherwise */ + errno = sav_errno; /* restore previous errno in case we recovered from retryable errors */ + + error_code = (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} + +int PNCIO_GEN_SetLock64(PNCIO_File *fd, int cmd, int type, MPI_Offset offset, int whence, + MPI_Offset len) +{ + FDTYPE fd_sys = fd->fd_sys; + int err, error_code; +#ifdef _LARGEFILE64_SOURCE + struct flock64 lock; +#else + struct flock lock; +#endif + + if (len == 0) + return MPI_SUCCESS; + + lock.l_type = type; + lock.l_start = offset; + lock.l_whence = whence; + lock.l_len = len; + + do { + err = fcntl(fd_sys, cmd, &lock); + } while (err && (errno == EINTR)); + + if (err && (errno != EBADF)) { + fprintf(stderr, + "File locking failed in PNCIO_GEN_SetLock64(fd %X,cmd %s/%X,type %s/%X,whence %X) with return value %X and errno %X.\n" + "If the file system is NFS, you need to use NFS version 3, ensure that the lockd daemon is running on all the machines, and mount the directory with the 'noac' option (no attribute caching).\n", + fd_sys, GEN_flock_cmd_to_string(cmd), cmd, + GEN_flock_type_to_string(type), type, whence, err, errno); + perror("PNCIO_GEN_SetLock64:"); + fprintf(stderr, "PNCIO_GEN_SetLock:offset %llu, length %llu\n", (unsigned long long) offset, + (unsigned long long) len); + MPI_Abort(MPI_COMM_WORLD, 1); + } + + error_code = (err == 0) ? MPI_SUCCESS : MPI_ERR_UNKNOWN; + return error_code; +} diff --git a/src/drivers/pncio/pncio_lustre_open.c b/src/drivers/pncio/pncio_lustre_open.c new file mode 100644 index 000000000..d9a1692a6 --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_open.c @@ -0,0 +1,1138 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include /* open(), O_CREAT */ +#include /* open() */ +#include /* dirname() */ + +#ifdef HAVE_LIMITS_H +#include +#endif +#ifndef PATH_MAX +#define PATH_MAX 65535 +#endif + +#ifdef HAVE_SYS_STAT_H +#include /* open(), fstat() */ +#endif + +#include + +#include "pncio.h" + +#ifdef MIMIC_LUSTRE +#define xstr(s) str(s) +#define str(s) #s +#define STRIPE_SIZE 64 +#define STRIPE_COUNT 4 +#endif + +#ifdef HAVE_LUSTRE +/* /usr/include/lustre/lustreapi.h + * /usr/include/linux/lustre/lustre_user.h + */ +#include + +#define PNETCDF_LUSTRE_DEBUG +// #define PNETCDF_LUSTRE_DEBUG_VERBOSE + +#define PATTERN_STR(pattern, int_str) ( \ + (pattern == LLAPI_LAYOUT_DEFAULT) ? "LLAPI_LAYOUT_DEFAULT" : \ + (pattern == LLAPI_LAYOUT_RAID0) ? "LLAPI_LAYOUT_RAID0" : \ + (pattern == LLAPI_LAYOUT_WIDE) ? "LLAPI_LAYOUT_WIDE" : \ + (pattern == LLAPI_LAYOUT_MDT) ? "LLAPI_LAYOUT_MDT" : \ + (pattern == LLAPI_LAYOUT_OVERSTRIPING) ? 
"LLAPI_LAYOUT_OVERSTRIPING" : \ + (pattern == LLAPI_LAYOUT_SPECIFIC) ? "LLAPI_LAYOUT_SPECIFIC" : \ + int_str) + +#define PRINT_LAYOUT(val) { \ + char int_str[32]; \ + snprintf(int_str, 32, "%lu", val); \ + printf("\t%-14s = %-25s (0x%lx)\n",#val,PATTERN_STR(val, int_str),val); \ +} + +/*----< get_total_avail_osts() >---------------------------------------------*/ +static +int get_total_avail_osts(const char *filename) +{ + char *dirc=NULL, *dname, *tail, **members=NULL, *buffer=NULL; + char pool_name[64], fsname[64], full_pool_name[128]; + int err, dd, num_members=0; + int max_members = 2048; /* Maximum number of members to retrieve */ + int buffer_size = 1048576; /* Buffer size for member names */ + struct llapi_layout *layout=NULL; + + dirc = NCI_Strdup(filename); + + struct stat sb; + if (stat(filename, &sb) == 0 && S_ISDIR(sb.st_mode)) + dname = dirc; + else + /* find the parent folder name */ + dname = dirname(dirc); + + dd = open(dname, O_RDONLY, 0600); + if (dd < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) fails to open folder %s (%s)\n", + __FILE__,__LINE__, dname, strerror(errno)); +#endif + goto err_out; + } + + /* obtain Lustre layout object */ + layout = llapi_layout_get_by_fd(dd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_get_by_fd() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* find the pool name */ + err = llapi_layout_pool_name_get(layout, pool_name, sizeof(pool_name)-1); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_pool_name_get() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + else if (pool_name[0] == '\0') { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"%s at %d: %s has NO Pool Name\n",__FILE__, __LINE__,dname); +#endif + goto err_out; + } + /* For example, Perlmutter @NERSC, pool_name "original" is returned */ + + /* Using pool_name returned from llapi_layout_pool_name_get() is not enough + * when calling llapi_get_poolmembers(). We need to prepend it with + * 'fsname', which can be obtained by calling llapi_getname(). Note that + * console command 'lfs getname -n' returns fsname. For example, on + * Perlmutter @NERSC: + * login39::~/Lustre(12:52) #1165 lfs getname -n $SCRATCH/dummy + * scratch + */ + err = llapi_getname(dname, fsname, 63); + if (err < 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_getname() fails (%s)\n", + __FILE__, __LINE__,strerror(errno)); +#endif + goto err_out; + } + + /* When dname is a folder, fsname returned from llapi_getname() may contain + * a trailing ID, e.g. scratch-ffff9ca88d9bd800. Must remove the trailing + * ID, otherwise llapi_get_poolmembers() is not able to find it. + */ + tail = strchr(fsname, '-'); + if (tail != NULL) *tail = '\0'; + + /* In case either pool_name and fsname are empty. For example, on Polaris + * @ALCF, the returned pool_name is empty, but fsname is not. 
+ */ + if (pool_name[0] == '\0' && fsname[0] == '\0') + goto err_out; + else if (pool_name[0] == '\0') + strcpy(full_pool_name, fsname); + else if (fsname[0] == '\0') + strcpy(full_pool_name, pool_name); + else + sprintf(full_pool_name, "%s.%s", fsname, pool_name); + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("%s at %d: file=%s dir=%s pool=%s fsname=%s full_pool_name=%s\n", + __func__,__LINE__, filename,dname,pool_name,fsname,full_pool_name); +#endif + + /* Allocate memory for the members and buffer */ + members = (char **)NCI_Malloc(max_members * sizeof(char *)); + buffer = (char *)NCI_Malloc(buffer_size); + + /* obtain pool's info */ + num_members = llapi_get_poolmembers(full_pool_name, members, max_members, + buffer, buffer_size); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (num_members > 0) { + int i, min_nmembers = MIN(num_members, 10); + printf("%s at %d: Found %d members for pool '%s':\n", + __func__,__LINE__,num_members, pool_name); + printf("\tFirst %d OSTs and last are\n",min_nmembers); + for (i=0; i= 0) close(dd); + if (layout != NULL) llapi_layout_free(layout); + if (dirc != NULL) NCI_Free(dirc); + if (buffer != NULL) NCI_Free(buffer); + if (members != NULL) NCI_Free(members); + + return num_members; +} + +static +int compare(const void *a, const void *b) +{ + if (*(uint64_t*)a > *(uint64_t*)b) return (1); + if (*(uint64_t*)a < *(uint64_t*)b) return (-1); + return (0); +} + +static +int sort_ost_ids(struct llapi_layout *layout, + uint64_t stripe_count, + uint64_t *osts) +{ + uint64_t i, numOSTs; + + for (i=0; i osts[numOSTs]) + osts[++numOSTs] = osts[i]; + + return (numOSTs + 1); +} + +/*----< get_striping() >-----------------------------------------------------*/ +static +uint64_t get_striping(int fd, + const char *path, + uint64_t *pattern, + uint64_t *stripe_count, + uint64_t *stripe_size, + uint64_t *start_iodevice) +{ + int err; + struct llapi_layout *layout; + uint64_t *osts=NULL, numOSTs=0; +#ifdef PNETCDF_LUSTRE_DEBUG + char int_str[32]; +#endif + + *pattern = LLAPI_LAYOUT_RAID0; + *stripe_count = LLAPI_LAYOUT_DEFAULT; + *stripe_size = LLAPI_LAYOUT_DEFAULT; + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + + layout = llapi_layout_get_by_fd(fd, LLAPI_LAYOUT_GET_COPY); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_get_by_fd() fails\n", + __FILE__, __LINE__); +#endif + goto err_out; + } + + err = llapi_layout_pattern_get(layout, pattern); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *pattern); + fprintf(stderr,"Error at %s (%d) llapi_layout_pattern_get() fails to get patter %s\n", + __FILE__, __LINE__, PATTERN_STR(*pattern, int_str)); +#endif + goto err_out; + } + + /* obtain file striping count */ + err = llapi_layout_stripe_count_get(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_count); + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_count_get() fails to get stripe count %s\n", + __FILE__, __LINE__, PATTERN_STR(*stripe_count, int_str)); +#endif + goto err_out; + } + + /* obtain file striping unit size */ + err = llapi_layout_stripe_size_get(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + snprintf(int_str, 32, "%lu", *stripe_size); + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_size_get() fails to get stripe size %s\n", + __FILE__,__LINE__, PATTERN_STR(*stripe_size, int_str)); +#endif + goto err_out; + } + + /* /usr/include/linux/lustre/lustre_user.h + * The stripe size fields are shared for 
the extension size storage, + * however the extension size is stored in KB, not bytes. + * #define SEL_UNIT_SIZE 1024llu + * Therefore, the default stripe_size is (SEL_UNIT_SIZE * 1024) + */ + + if (*stripe_count == LLAPI_LAYOUT_DEFAULT || /* not set */ + *stripe_count == LLAPI_LAYOUT_INVALID || /* invalid */ + *stripe_count == LLAPI_LAYOUT_WIDE || /* all system's OSTs */ + *stripe_count > 1048576) { /* abnormally large number */ + return 0; + } + + /* obtain all OST IDs */ + osts = (uint64_t*) NCI_Malloc(sizeof(uint64_t) * (*stripe_count)); + if (llapi_layout_ost_index_get(layout, 0, &osts[0]) != 0) { + /* check if is a folder */ + struct stat path_stat; + fstat(fd, &path_stat); +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + if (S_ISREG(path_stat.st_mode)) /* not a regular file */ + printf("%s at %d: %s is a regular file\n",__func__,__LINE__,path); + else if (S_ISDIR(path_stat.st_mode)) + printf("%s at %d: %s is a folder\n",__func__,__LINE__,path); + else +#endif + if (!S_ISREG(path_stat.st_mode) && /* not a regular file */ + !S_ISDIR(path_stat.st_mode)) { /* not a folder */ +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) calling fstat() file %s (neither a regular file nor a folder)\n", \ + __FILE__, __LINE__, path); +#endif + goto err_out; + } + + *start_iodevice = LLAPI_LAYOUT_DEFAULT; + numOSTs = *stripe_count; + + goto err_out; + } + *start_iodevice = osts[0]; + + numOSTs = sort_ost_ids(layout, *stripe_count, osts); + assert(numOSTs <= *stripe_count); + +err_out: + if (osts != NULL) NCI_Free(osts); + if (layout != NULL) llapi_layout_free(layout); + + return numOSTs; +} + +/*----< set_striping() >-----------------------------------------------------*/ +static +int set_striping(const char *path, + uint64_t pattern, + uint64_t numOSTs, + uint64_t stripe_count, + uint64_t stripe_size, + uint64_t start_iodevice) +{ + int fd=-1, err=0; + + struct llapi_layout *layout = llapi_layout_alloc(); + if (layout == NULL) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_alloc() fails (%s)\n", + __FILE__, __LINE__, strerror(errno)); +#endif + goto err_out; + } + + /* When an abnormally large stripe_count is set by users, Lustre may just + * allocate the total number of available OSTs, instead of returning an + * error. + */ + err = llapi_layout_stripe_count_set(layout, stripe_count); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_count_set() fails set stripe count %lu (%s)\n", + __FILE__, __LINE__, stripe_count, strerror(errno)); +#endif + goto err_out; + } + + err = llapi_layout_stripe_size_set(layout, stripe_size); + if (err != 0) { +#ifdef PNETCDF_LUSTRE_DEBUG + fprintf(stderr,"Error at %s (%d) llapi_layout_stripe_size_set() fails to set strpe size %lu (%s)\n", + __FILE__, __LINE__, stripe_size, strerror(errno)); +#endif + goto err_out; + } + + if (pattern == LLAPI_LAYOUT_OVERSTRIPING) { + uint64_t i, ost_id; + if (start_iodevice == LLAPI_LAYOUT_DEFAULT) + start_iodevice = 0; + for (i=0; i------------------------------------------*/ +/* Construct the list of I/O aggregators. It sets the followings. + * fd->hints->cb_nodes and set file info for hint cb_nodes. + * fd->hints->ranklist[], an int array of size fd->hints->cb_nodes. + * fd->is_agg: indicating whether this rank is an I/O aggregator + * fd->my_cb_nodes_index: index into fd->hints->ranklist[]. 
-1 if N/A + */ +static +int Lustre_set_cb_node_list(PNCIO_File *fd) +{ + int i, j, k, rank, nprocs, num_aggr, striping_factor; + int *nprocs_per_node, **ranks_per_node; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + /* number of MPI processes running on each node */ + nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int)); + + for (i=0; i<nprocs; i++) + nprocs_per_node[fd->node_ids[i]]++; + + /* construct rank IDs of MPI processes running on each node */ + ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes); + ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs); + for (i=1; i<fd->num_nodes; i++) + ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1]; + + for (i=0; i<fd->num_nodes; i++) nprocs_per_node[i] = 0; + + /* Populate ranks_per_node[], list of MPI ranks running on each node. + * Populate nprocs_per_node[], number of MPI processes on each node. + */ + for (i=0; i<nprocs; i++) { + k = fd->node_ids[i]; + ranks_per_node[k][nprocs_per_node[k]] = i; + nprocs_per_node[k]++; + } + + /* To save a call to MPI_Bcast(), all processes run the same code below to + * calculate num_aggr, the number of aggregators (later becomes cb_nodes). + * + * The calculation is based on the number of compute nodes, fd->num_nodes, + * and processes per node, nprocs_per_node. At this moment, all processes + * should have obtained the Lustre file striping settings. + */ + striping_factor = fd->hints->striping_factor; + + if (striping_factor > nprocs) { + /* When the number of MPI processes is less than striping_factor, set + * num_aggr to the largest number not exceeding nprocs that divides + * striping_factor. A naive way is: + * num_aggr = nprocs; + * while (striping_factor % num_aggr > 0) + * num_aggr--; + * Below is equivalent, but faster. + */ + int divisor = 2; + num_aggr = 1; + /* try to divide */ + while (striping_factor >= divisor * divisor) { + if ((striping_factor % divisor) == 0) { + if (striping_factor / divisor <= nprocs) { + /* The value is found! */ + num_aggr = striping_factor / divisor; + break; + } + /* if divisor is less than nprocs, divisor is a solution, + * but it may not be the best one + */ + else if (divisor <= nprocs) + num_aggr = divisor; + } + divisor++; + } + } + else { /* striping_factor <= nprocs */ + /* Select striping_factor processes to be I/O aggregators. Note this + * also applies to collective reads to allow more/fewer aggregators. In + * most cases, more aggregators yield better read performance. + */ + if (fd->hints->cb_nodes == 0) { + /* User did not set hint "cb_nodes" */ + if (nprocs >= striping_factor * 8 && nprocs/fd->num_nodes >= 8) + num_aggr = striping_factor * 8; + else if (nprocs >= striping_factor * 4 && nprocs/fd->num_nodes >= 4) + num_aggr = striping_factor * 4; + else if (nprocs >= striping_factor * 2 && nprocs/fd->num_nodes >= 2) + num_aggr = striping_factor * 2; + else + num_aggr = striping_factor; + } + else if (fd->hints->cb_nodes <= striping_factor) { + /* User has set hint cb_nodes and cb_nodes <= striping_factor. + * Ignore user's hint and try to set cb_nodes to be at least + * striping_factor. + */ + num_aggr = striping_factor; + } + else { + /* User has set hint cb_nodes and cb_nodes > striping_factor */ + if (nprocs < fd->hints->cb_nodes) + num_aggr = nprocs; /* BAD cb_nodes set by users */ + else + num_aggr = fd->hints->cb_nodes; + }
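A worked example of the divisor search in the striping_factor > nprocs branch above, as a standalone helper (the name pick_num_aggr is hypothetical):

#include <stdio.h>

/* Pick the largest number of aggregators <= nprocs that divides
 * striping_factor, so every aggregator serves the same number of OSTs.
 * Mirrors the divisor loop above, including its early exit. */
static int pick_num_aggr(int striping_factor, int nprocs)
{
    int num_aggr = 1;
    for (int divisor = 2; divisor * divisor <= striping_factor; divisor++) {
        if (striping_factor % divisor) continue;
        if (striping_factor / divisor <= nprocs)
            return striping_factor / divisor; /* largest co-divisor <= nprocs */
        if (divisor <= nprocs)
            num_aggr = divisor;  /* best small divisor seen so far */
    }
    return num_aggr;
}

/* pick_num_aggr(24, 5) == 4: each of 4 aggregators handles 6 OSTs;
 * pick_num_aggr(7, 3) == 1, since 7 is prime. */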
+ + /* Number of processes per node may not be enough to be picked as + * aggregators. In this case, reduce num_aggr (cb_nodes). Consider the + * following case: + * number of nodes = 7, + * number of processes = 18, + * striping_factor = 8, + * cb_nodes = 16. + * In this case, nodes 0, 1, 2, 3 run 3 processes each and nodes + * 4, 5, 6 run 2 processes each. In order to keep each OST only + * accessed by one or more aggregators running on the same compute + * node, cb_nodes should be reduced to 8. Thus the ranks of aggregators + * become 0, 3, 6, 9, 12, 14, 16, 1. The aggregator-OST mapping + * is shown below. + * Aggregator 0, running on node 0, accesses OST 0. + * Aggregator 3, running on node 1, accesses OST 1. + * Aggregator 6, running on node 2, accesses OST 2. + * Aggregator 9, running on node 3, accesses OST 3. + * Aggregator 12, running on node 4, accesses OST 4. + * Aggregator 14, running on node 5, accesses OST 5. + * Aggregator 16, running on node 6, accesses OST 6. + * Aggregator 1, running on node 0, accesses OST 7. + * + * Another case (the total number of processes changes to 25): + * number of nodes = 7, + * number of processes = 25, + * striping_factor = 8, + * cb_nodes = 16. + * In this case, nodes 0, 1, 2, 3 run 4 processes each and nodes 4, 5, + * 6 run 3 processes each. cb_nodes should remain 16 and the ranks of + * aggregators become 0, 4, 8, 12, 16, 19, 22, 1, 2, 6, 10, 14, 18, 21, + * 24, 3. The aggregator-OST mapping is shown below. + * Aggregators 0, 2, running on node 0, access OST 0. + * Aggregators 4, 6, running on node 1, access OST 1. + * Aggregators 8, 10, running on node 2, access OST 2. + * Aggregators 12, 14, running on node 3, access OST 3. + * Aggregators 16, 18, running on node 4, access OST 4. + * Aggregators 19, 21, running on node 5, access OST 5. + * Aggregators 22, 24, running on node 6, access OST 6. + * Aggregator 3, running on node 0, accesses OST 7. + */ + int max_nprocs_node = 0; + for (i=0; i<fd->num_nodes; i++) + max_nprocs_node = MAX(max_nprocs_node, nprocs_per_node[i]); + int max_naggr_node = striping_factor / fd->num_nodes; + if (striping_factor % fd->num_nodes) max_naggr_node++; + /* max_naggr_node is the max number of processes per node to be picked + * as aggregator in each round. + */ + int rounds = num_aggr / striping_factor; + if (num_aggr % striping_factor) rounds++; + while (max_naggr_node * rounds > max_nprocs_node) rounds--; + num_aggr = striping_factor * rounds; + } + + /* TODO: the above setting for num_aggr is for collective writes. Should + * collective reads use the same? Or just set cb_nodes to the number of + * nodes? + */ + + /* The next step is to determine the MPI rank IDs of I/O aggregators and add + * them into ranklist[]. Note fd->hints->ranklist will be freed in + * PNCIO_File_close(). + */ + fd->hints->ranklist = (int *) NCI_Malloc(num_aggr * sizeof(int)); + if (fd->hints->ranklist == NULL) + return NC_ENOMEM; + + int block_assignment=0; +#ifdef TRY_AGGR_BLOCK_ASSIGNMENT + { + char *env_str; + if ((env_str = getenv("PNETCDF_USE_BLOCK_ASSIGN")) != NULL) + block_assignment = (strcasecmp(env_str, "true") == 0) ? 1 : 0; + if (rank == 0) + printf("%s %d: PNETCDF_USE_BLOCK_ASSIGN = %d\n", + __func__,__LINE__,block_assignment); + } +#endif + + if (striping_factor <= fd->num_nodes) { + /* When the number of OSTs is less than the number of compute nodes, + * first select a number of nodes equal to the number of OSTs by + * spreading the selection evenly across all compute nodes (i.e. with a + * stride between every 2 consecutive nodes). + * Selection of MPI ranks can be done in 2 ways. + * 1. block assignment + * Select ranks from a node and then move on to the next node. + * 2.
+         * 2. cyclic assignment
+         *    Select ranks round-robin across all selected nodes.
+         * Note when selecting ranks within a node, the ranks are evenly
+         * spread among all processes in the node.
+         */
+        if (block_assignment) {
+            int n=0;
+            int remain = num_aggr % striping_factor;
+            int node_stride = fd->num_nodes / striping_factor;
+            /* walk through each node and pick aggregators */
+            for (j=0; j<fd->num_nodes; j+=node_stride) {
+                /* Selecting node IDs with a stride. j is the node ID */
+                int nranks_per_node = num_aggr / striping_factor;
+                /* front nodes may have 1 more to pick */
+                if (remain > 0 && j/node_stride < remain) nranks_per_node++;
+                int rank_stride = nprocs_per_node[j] / nranks_per_node;
+                for (k=0; k<nranks_per_node; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->num_nodes; /* break loop j */
+                        break;             /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            int avg = num_aggr / striping_factor;
+            int stride = fd->num_nodes / striping_factor;
+            if (num_aggr % striping_factor) avg++;
+            for (i = 0; i < num_aggr; i++) {
+                /* j is the selected node ID. This selection is round-robin
+                 * across selected nodes.
+                 */
+                j = (i % striping_factor) * stride;
+                k = (i / striping_factor) * (nprocs_per_node[j] / avg);
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+    }
+    else { /* striping_factor > fd->num_nodes */
+        /* When the number of OSTs is more than the number of compute nodes,
+         * I/O aggregators are selected from all nodes. Within each node,
+         * aggregators are spread evenly instead of using the first few
+         * ranks.
+         */
+        int *naggr_per_node, *idx_per_node, avg;
+        idx_per_node = (int*) NCI_Calloc(fd->num_nodes, sizeof(int));
+        naggr_per_node = (int*) NCI_Malloc(fd->num_nodes * sizeof(int));
+        for (i = 0; i < striping_factor % fd->num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->num_nodes + 1;
+        for (; i < fd->num_nodes; i++)
+            naggr_per_node[i] = striping_factor / fd->num_nodes;
+        avg = num_aggr / striping_factor;
+        if (avg > 0)
+            for (i = 0; i < fd->num_nodes; i++)
+                naggr_per_node[i] *= avg;
+        for (i = 0; i < fd->num_nodes; i++)
+            naggr_per_node[i] = MIN(naggr_per_node[i], nprocs_per_node[i]);
+        /* naggr_per_node[] is the number of processes on each node that can
+         * be selected as I/O aggregators
+         */
+
+        if (block_assignment) {
+            int n = 0;
+            for (j=0; j<fd->num_nodes; j++) {
+                /* j is the node ID */
+                int rank_stride = nprocs_per_node[j] / naggr_per_node[j];
+                /* trying stride==1 seems to have no effect: rank_stride = 1; */
+                for (k=0; k<naggr_per_node[j]; k++) {
+                    fd->hints->ranklist[n] = ranks_per_node[j][k*rank_stride];
+                    if (++n == num_aggr) {
+                        j = fd->num_nodes; /* break loop j */
+                        break;             /* loop k */
+                    }
+                }
+            }
+        }
+        else {
+            for (i = 0; i < num_aggr; i++) {
+                int stripe_i = i % striping_factor;
+                j = stripe_i % fd->num_nodes; /* to select from node j */
+                k = nprocs_per_node[j] / naggr_per_node[j];
+                k *= idx_per_node[j];
+                /* trying stride==1 seems to have no effect: k = idx_per_node[j]; */
+                idx_per_node[j]++;
+                assert(k < nprocs_per_node[j]);
+                fd->hints->ranklist[i] = ranks_per_node[j][k];
+            }
+        }
+        NCI_Free(naggr_per_node);
+        NCI_Free(idx_per_node);
+    }
+
+    /* TODO: we could keep these two arrays for dynamic construction of
+     * fd->hints->ranklist[], such as in the group-cyclic file domain
+     * assignment method, used in each collective write call.
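+     *
+     * A sketch (hypothetical, not part of this patch) of how a ranklist
+     * could be rebuilt from the two arrays, picking aggregators round-robin
+     * across nodes; it assumes num_aggr <= nprocs, so the loop terminates:
+     *
+     *     int n = 0, *next = (int*) NCI_Calloc(fd->num_nodes, sizeof(int));
+     *     while (n < num_aggr)
+     *         for (j = 0; j < fd->num_nodes && n < num_aggr; j++)
+     *             if (next[j] < nprocs_per_node[j])  // node j has one left
+     *                 ranklist[n++] = ranks_per_node[j][next[j]++];
+     *     NCI_Free(next);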
+     */
+    NCI_Free(nprocs_per_node);
+    NCI_Free(ranks_per_node[0]);
+    NCI_Free(ranks_per_node);
+
+    /* set hint cb_nodes */
+    fd->hints->cb_nodes = num_aggr;
+
+    /* check whether this process is selected as an I/O aggregator */
+    fd->is_agg = 0;
+    fd->my_cb_nodes_index = -1;
+    for (i = 0; i < num_aggr; i++) {
+        if (rank == fd->hints->ranklist[i]) {
+            fd->is_agg = 1;
+            fd->my_cb_nodes_index = i;
+            break;
+        }
+    }
+
+    return 0;
+}
+
+/*----< PNCIO_Lustre_create() >----------------------------------------------*/
+/* 1. root creates the file
+ * 2. root sets and obtains striping info
+ * 3. root broadcasts striping info
+ * 4. non-root processes receive striping info from root
+ * 5. non-root processes open the file
+ */
+int
+PNCIO_Lustre_create(PNCIO_File *fd,
+                    int         mpi_io_mode)
+{
+    char int_str[16];
+    int err=NC_NOERR, rank, perm, old_mask;
+    int stripin_info[4] = {-1, -1, -1, -1};
+#ifdef HAVE_LUSTRE
+    int total_num_OSTs;
+    uint64_t numOSTs, pattern, stripe_count, stripe_size, start_iodevice;
+#endif
+
+#ifdef WKL_DEBUG
+extern int first_ost_id;
+first_ost_id = -1;
+#endif
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+static int wkl=0; if (wkl == 0 && rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);}
+#endif
+
+#if defined(HAVE_LUSTRE) || defined(MIMIC_LUSTRE)
+assert(mpi_io_mode & MPI_MODE_CREATE);
+
+/* Note ncmpi_create always creates a file with readable and writable
+ * permission. */
+    int amode = O_CREAT;
+    if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR;
+#endif
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    /* The root process creates the file first, followed by all processes
+     * opening the file.
+     */
+    if (rank > 0) goto err_out;
+
+    /* For Lustre, we need to obtain the file striping info
+     * (striping_factor, striping_unit, and num_osts) in order to select the
+     * I/O aggregators in fd->hints->ranklist, no matter whether the mode is
+     * open or create.
+     */
+
+#ifdef HAVE_LUSTRE
+    int overstriping_ratio, str_factor, str_unit, start_iodev;
+
+    /* In a call to PNCIO_File_SetInfo() earlier, hints have been validated
+     * to be consistent among all processes.
+     */
+
+    str_unit           = fd->hints->striping_unit;
+    str_factor         = fd->hints->striping_factor;
+    start_iodev        = fd->hints->start_iodevice;
+    overstriping_ratio = fd->hints->fs_hints.lustre.overstriping_ratio;
+
+    /* obtain the total number of OSTs available */
+    total_num_OSTs = get_total_avail_osts(fd->filename);
+    if (total_num_OSTs <= 0) /* failed to obtain number of available OSTs */
+        total_num_OSTs = PNCIO_LUSTRE_MAX_OSTS;
+    if (str_factor > total_num_OSTs)
+        str_factor = total_num_OSTs;
+
+    numOSTs        = 0;
+    pattern        = LLAPI_LAYOUT_DEFAULT;
+    stripe_count   = LLAPI_LAYOUT_DEFAULT;
+    stripe_size    = LLAPI_LAYOUT_DEFAULT;
+    start_iodevice = LLAPI_LAYOUT_DEFAULT;
+
+    fd->fd_sys = -1;
+
+    /* When no file striping hint is set, the default values are:
+     *     fd->hints->striping_factor = 0;
+     *     fd->hints->striping_unit = 0;
+     *     fd->hints->start_iodevice = -1;
+     *     fd->hints->fs_hints.lustre.overstriping_ratio = 1;
+     */
+
+    /* In many cases, the Lustre striping configuration of the file to be
+     * created is not explicitly set by the user (through I/O hints
+     * striping_factor and striping_unit), or the striping configuration of
+     * the parent folder to store the new file is not explicitly set by the
+     * user.
+ * + * Here, if application did not set the file striping hints, we set the new + * file's striping count to be equal to the number of compute nodes + * allocated to fd->comm and the striping size to 1 MiB. Inheriting the + * striping from the parent folder is disabled. But if inheritance is + * desired, this can be changed by defining macro INHERIT_DIR_STRIPING + * which enables the code block below. + * + * Note if the application explicitly set hints striping_factor and + * striping_unit, then they take precedence over the default. + */ +#ifdef INHERIT_DIR_STRIPING + /* Inherit the file striping settings of the folder. */ + + if (str_factor == 0 || str_unit == 0 || + (overstriping_ratio > 1 && start_iodev < 0)) { + /* When not all of the striping parameters are set by users, inherit + * those missing ones from the folder. + */ + int dd; + char *dirc, *dname; + dirc = NCI_Strdup(fd->filename); + dname = dirname(dirc); + + dd = open(dname, O_RDONLY, PNCIO_PERM); + + numOSTs = get_striping(dd, dname, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + close(dd); + NCI_Free(dirc); + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("line %d: use parent folder's striping to set file's:\n",__LINE__); + PRINT_LAYOUT(numOSTs); + PRINT_LAYOUT(stripe_count); + PRINT_LAYOUT(stripe_size); + PRINT_LAYOUT(start_iodevice); + PRINT_LAYOUT(pattern); +#endif + /* in case of default striping setting is used */ + if (numOSTs == 0) numOSTs = 1; + } +#endif + + /* If hint striping_factor is not set by the user and the new file's folder + * has not set its striping parameters, then we set the number of unique + * OSTs, numOSTs, to the number of compute nodes allocated to this job, + * which sets stripe_count to (numOSTs * overstriping_ratio). + */ + if (str_factor == 0 && (stripe_count == LLAPI_LAYOUT_DEFAULT || + stripe_count == LLAPI_LAYOUT_WIDE)) { + stripe_count = MIN(fd->num_nodes, total_num_OSTs); + if (overstriping_ratio > 1) stripe_count *= overstriping_ratio; + } + else if (str_factor > 0) + stripe_count = str_factor; + + /* When overstriping is requested by the user, calculate the number of + * unique OSTs. + */ + if (overstriping_ratio > 1) { + pattern = LLAPI_LAYOUT_OVERSTRIPING; + if (stripe_count < overstriping_ratio) + numOSTs = 1; + else + numOSTs = stripe_count / overstriping_ratio; + } + /* If ill values are detected, fall back to no overstriping */ + if (overstriping_ratio <= 1 || numOSTs == stripe_count) { + numOSTs = stripe_count; + pattern = LLAPI_LAYOUT_RAID0; + } + + /* If user has not set hint striping_unit and the folder's striping size is + * also not set, then use the default. + */ + if (str_unit == 0 && stripe_size == LLAPI_LAYOUT_DEFAULT) + stripe_size = LLAPI_LAYOUT_DEFAULT; + else if (str_unit > 0) + stripe_size = str_unit; + + /* If user has not set hint start_iodevice and the folder's start_iodevice + * is also not set, then use the default. 
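+     *
+     * The same precedence, user hint first, then the folder's setting, then
+     * the Lustre default, applies to all three striping parameters. A
+     * compact sketch of the rule (hypothetical helper, not part of this
+     * patch):
+     *
+     *     uint64_t resolve(int hint, uint64_t folder_val) {
+     *         if (hint > 0) return (uint64_t) hint;    // user hint wins
+     *         if (folder_val != LLAPI_LAYOUT_DEFAULT)
+     *             return folder_val;                   // inherit folder
+     *         return LLAPI_LAYOUT_DEFAULT;             // let Lustre pick
+     *     }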
+ */ + if (start_iodev == -1 && start_iodevice == LLAPI_LAYOUT_DEFAULT) + start_iodevice = LLAPI_LAYOUT_DEFAULT; + else if (start_iodev > 0) + start_iodevice = start_iodev; + +#ifdef PNETCDF_LUSTRE_DEBUG_VERBOSE + printf("\n\tAfter adjust striping parameters become:\n"); + PRINT_LAYOUT(numOSTs); + PRINT_LAYOUT(stripe_count); + PRINT_LAYOUT(stripe_size); + PRINT_LAYOUT(start_iodevice); + PRINT_LAYOUT(pattern); +#endif + + /* create a new file and set striping */ + fd->fd_sys = set_striping(fd->filename, pattern, + numOSTs, + stripe_count, + stripe_size, + start_iodevice); + + if (fd->fd_sys < 0) + /* If explicitly setting file striping failed, inherit the striping + * from the folder by simply creating the file. + */ + fd->fd_sys = open(fd->filename, amode, perm); + + if (fd->fd_sys < 0) { + fprintf(stderr,"Error at %s (%d) fails to create file %s (%s)\n", + __FILE__,__LINE__, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("Lustre set striping"); + goto err_out; + } + + /* Obtain Lustre file striping parameters actually set. */ + numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + + stripin_info[0] = stripe_size; + stripin_info[1] = stripe_count; + stripin_info[2] = start_iodevice; + stripin_info[3] = numOSTs; + +#elif defined(MIMIC_LUSTRE) + fd->fd_sys = open(fd->filename, amode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d fails to create file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + + char *env_str = getenv("MIMIC_STRIPE_SIZE"); + if (env_str != NULL) + stripin_info[0] = atoi(env_str); + else + stripin_info[0] = STRIPE_SIZE; + stripin_info[1] = STRIPE_COUNT; + stripin_info[2] = 0; + stripin_info[3] = STRIPE_COUNT; +#endif + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + if (fd->file_system == PNCIO_LUSTRE && + (stripin_info[0] == -1 || stripin_info[3] == 0)) { + fprintf(stderr, "%s line %d: failed to create Lustre file %s\n", + __FILE__, __LINE__, fd->filename); + return err; + } + + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + if (fd->file_system == PNCIO_LUSTRE) { + fd->hints->fs_hints.lustre.num_osts = stripin_info[3]; + fd->hints->fs_hints.lustre.overstriping_ratio = stripin_info[1] / stripin_info[3]; + } + + if (rank > 0) { /* non-root processes */ + fd->fd_sys = open(fd->filename, O_RDWR, perm); + if (fd->fd_sys == -1) { + fprintf(stderr,"%s line %d: rank %d failure to open file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + return ncmpii_error_posix2nc("ioctl"); + } + } + + /* construct cb_nodes rank list */ + Lustre_set_cb_node_list(fd); + + MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:"); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.num_osts); + MPI_Info_set(fd->info, "lustre_num_osts", int_str); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.overstriping_ratio); + MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str); + + return err; +} + +/*----< PNCIO_Lustre_open() >------------------------------------------------*/ +/* 1. all processes open the file. + * 2. 
root obtains striping info and broadcasts to all others + */ +int +PNCIO_Lustre_open(PNCIO_File *fd) +{ + char int_str[16]; + int err=NC_NOERR, rank, perm, old_mask; + int stripin_info[4] = {1048576, -1, -1, -1}; + +#ifdef WKL_DEBUG +extern int first_ost_id; +first_ost_id = -1; +#endif + + MPI_Comm_rank(fd->comm, &rank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +static int wkl=0; if (wkl == 0 && rank == 0) { printf("\nxxxx %s at %d: %s ---- %s\n",__func__,__LINE__,(fd->file_system == PNCIO_LUSTRE)?"PNCIO_LUSTRE":"PNCIO_UFS",fd->filename); wkl++; fflush(stdout);} +#endif + + old_mask = umask(022); + umask(old_mask); + perm = old_mask ^ PNCIO_PERM; + + int omode = (fd->access_mode & MPI_MODE_RDWR) ? O_RDWR : O_RDONLY; + + /* All processes open the file. */ + fd->fd_sys = open(fd->filename, omode, perm); + if (fd->fd_sys == -1) { + fprintf(stderr, "%s line %d: rank %d fails to open file %s (%s)\n", + __FILE__,__LINE__, rank, fd->filename, strerror(errno)); + err = ncmpii_error_posix2nc("open"); + goto err_out; + } + + /* Only root obtains the striping information and bcast to all other + * processes. + */ + if (rank == 0) { +#ifdef HAVE_LUSTRE + uint64_t numOSTs=0; + uint64_t pattern = LLAPI_LAYOUT_DEFAULT; + uint64_t stripe_count = LLAPI_LAYOUT_DEFAULT; + uint64_t stripe_size = LLAPI_LAYOUT_DEFAULT; + uint64_t start_iodevice = LLAPI_LAYOUT_DEFAULT; + + numOSTs = get_striping(fd->fd_sys, fd->filename, &pattern, + &stripe_count, + &stripe_size, + &start_iodevice); + + stripin_info[0] = stripe_size; + stripin_info[1] = stripe_count; + stripin_info[2] = start_iodevice; + stripin_info[3] = numOSTs; + +#elif defined(MIMIC_LUSTRE) + char *env_str = getenv("MIMIC_STRIPE_SIZE"); + if (env_str != NULL) + stripin_info[0] = atoi(env_str); + else + stripin_info[0] = STRIPE_SIZE; + stripin_info[1] = STRIPE_COUNT; + stripin_info[2] = 0; + stripin_info[3] = STRIPE_COUNT; +#endif + } + +err_out: + MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm); + fd->hints->striping_unit = stripin_info[0]; + fd->hints->striping_factor = stripin_info[1]; + fd->hints->start_iodevice = stripin_info[2]; + fd->hints->fs_hints.lustre.num_osts = stripin_info[3]; + fd->hints->fs_hints.lustre.overstriping_ratio = stripin_info[1] / stripin_info[3]; + + /* construct cb_nodes rank list */ + Lustre_set_cb_node_list(fd); + + MPI_Info_set(fd->info, "romio_filesystem_type", "LUSTRE:"); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.num_osts); + MPI_Info_set(fd->info, "lustre_num_osts", int_str); + + snprintf(int_str, 16, "%d", fd->hints->fs_hints.lustre.overstriping_ratio); + MPI_Info_set(fd->info, "lustre_overstriping_ratio", int_str); + + return err; +} + diff --git a/src/drivers/pncio/pncio_lustre_wrcoll.c b/src/drivers/pncio/pncio_lustre_wrcoll.c new file mode 100644 index 000000000..03b0a59e9 --- /dev/null +++ b/src/drivers/pncio/pncio_lustre_wrcoll.c @@ -0,0 +1,2389 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <pncio.h>
+
+static int use_alltoallw;
+
+#ifdef HAVE_MPI_LARGE_COUNT
+#define MEMCPY_UNPACK(x, inbuf, start, count, outbuf) {       \
+    int _k;                                                   \
+    char *_ptr = (inbuf);                                     \
+    MPI_Count  *mem_ptrs = others_req[x].mem_ptrs + (start);  \
+    MPI_Offset *mem_lens = others_req[x].lens     + (start);  \
+    for (_k=0; _k<count; _k++) {                              \
+        memcpy((outbuf) + mem_ptrs[_k], _ptr, mem_lens[_k]);  \
+        _ptr += mem_lens[_k];                                 \
+    }                                                         \
+}
+#else
+#define MEMCPY_UNPACK(x, inbuf, start, count, outbuf) {       \
+    int _k;                                                   \
+    char *_ptr = (inbuf);                                     \
+    MPI_Aint   *mem_ptrs = others_req[x].mem_ptrs + (start);  \
+    int        *mem_lens = others_req[x].lens     + (start);  \
+    for (_k=0; _k<count; _k++) {                              \
+        memcpy((outbuf) + mem_ptrs[_k], _ptr, mem_lens[_k]);  \
+        _ptr += mem_lens[_k];                                 \
+    }                                                         \
+}
+#endif
+
+/*----< LUSTRE_Calc_aggregator() >--------------------------------------------*/
+/* Given a file offset 'off' and length '*len', calculate which I/O
+ * aggregator's file domain the offset falls into. '*len' may be shortened
+ * to the amount covered by that aggregator's file domain.
+ */
+static
+int LUSTRE_Calc_aggregator(PNCIO_File *fd,
+                           MPI_Offset  off,
+#ifdef HAVE_MPI_LARGE_COUNT
+                           MPI_Offset *len
+#else
+                           int        *len
+#endif
+                          )
+{
+    MPI_Offset avail_bytes, stripe_id;
+
+    stripe_id = off / fd->hints->striping_unit;
+
+    avail_bytes = (stripe_id + 1) * fd->hints->striping_unit - off;
+    if (avail_bytes < *len) {
+        /* The request [off, off+len) has only [off, off+avail_bytes) part
+         * falling into aggregator's file domain */
+        *len = avail_bytes;
+    }
+    /* return the index to ranklist[] */
+    return (stripe_id % fd->hints->cb_nodes);
+}
+
+/*----< LUSTRE_Calc_my_req() >-----------------------------------------------*/
+/* calculates what portions of the read/write requests of this process fall
+ * into the file domains of all I/O aggregators.
+ * IN: fd->flat_file: this rank's flattened write requests
+ *     fd->flat_file.count: number of noncontiguous offset-length file
+ *         requests
+ *     fd->flat_file.off[fd->flat_file.count]: file offsets of individual
+ *         noncontiguous requests.
+ *     fd->flat_file.len[fd->flat_file.count]: lengths of individual
+ *         noncontiguous requests.
+ * IN: buf_is_contig: whether the write buffer is contiguous or not
+ * OUT: my_req_ptr[cb_nodes]: offset-length pairs of this process's requests
+ *      that fall into the file domain of each aggregator
+ * OUT: buf_idx_ptr[cb_nodes]: index pointing to the starting location in
+ *      user_buf for data to be sent to each aggregator.
+ */
+static
+void LUSTRE_Calc_my_req(PNCIO_File *fd,
+                        int buf_is_contig,
+                        PNCIO_Access **my_req_ptr,
+                        MPI_Offset **buf_idx)
+{
+    int aggr, *aggr_ranks, cb_nodes;
+    MPI_Count i, l;
+    size_t nelems, alloc_sz;
+#ifdef HAVE_MPI_LARGE_COUNT
+    MPI_Offset rem_len, avail_len, *avail_lens;
+#else
+    int rem_len, avail_len, *avail_lens;
+#endif
+    MPI_Offset curr_idx, off;
+    PNCIO_Access *my_req;
+
+    cb_nodes = fd->hints->cb_nodes;
+
+    /* my_req[i].count gives the number of contiguous requests of this
+     * process that fall in aggregator i's file domain (not process MPI
+     * rank i).
+     */
+    my_req = (PNCIO_Access *) NCI_Calloc(cb_nodes, sizeof(PNCIO_Access));
+    *my_req_ptr = my_req;
+
+    /* First pass is just to calculate how much space is needed to allocate
+     * my_req.
+     */
+#ifdef HAVE_MPI_LARGE_COUNT
+    alloc_sz = sizeof(int) + sizeof(MPI_Offset);
+    aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count);
+    avail_lens = (MPI_Offset*) (aggr_ranks + fd->flat_file.count);
+#else
+    alloc_sz = sizeof(int) * 2;
+    aggr_ranks = (int*) NCI_Malloc(alloc_sz * fd->flat_file.count);
+    avail_lens = aggr_ranks + fd->flat_file.count;
+#endif
+
+    /* Note that the MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0
+     * Chapter 14.1.1) requires that the typemap displacements of etype and
+     * filetype are non-negative and monotonically non-decreasing. This
+     * makes fd->flat_file.off[] monotonically non-decreasing.
+     */
+
+/*
+Alternative, especially for when fd->flat_file.count is large:
+1. This rank's aggregate file access region is from start_offset to
+   end_offset.
+2. Start with the 1st aggregator ID and keep assigning aggregators until the
+   next stripe. This can avoid too many calls to LUSTRE_Calc_aggregator().
+*/
+
+    /* nelems will be the number of offset-length pairs for my_req[] */
+    nelems = 0;
+    for (i = 0; i < fd->flat_file.count; i++) {
+        /* short circuit offset/len processing if zero-byte read/write.
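+         * This first pass only counts how many offset-length pairs fall
+         * into each aggregator's file domain so that my_req[] can be
+         * allocated exactly once; the second pass further below fills the
+         * pairs in. The pattern in miniature (a sketch, not the actual
+         * code):
+         *
+         *     for (i=0; i<n; i++) cnt[target(i)]++;       // pass 1: count
+         *     // allocate per-target arrays sized by cnt[]
+         *     for (i=0; i<n; i++) append(target(i), i);   // pass 2: fill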
*/ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + avail_len = fd->flat_file.len[i]; + /* LUSTRE_Calc_aggregator() modifies the value of 'avail_len' to the + * amount that is only covered by the aggr's file domain. The remaining + * (tail) will continue to be processed to determine to whose file + * domain it belongs. As LUSTRE_Calc_aggregator() can be expensive for + * large value of fd->flat_file.count, we keep a copy of the returned + * values of 'aggr' and 'avail_len' in aggr_ranks[] and avail_lens[] to + * be used in the next for loop (not next iteration). + * + * Note the returned value in 'aggr' is the index to ranklist[], i.e. + * the 'aggr'th element of array ranklist[], rather than the + * aggregator's MPI rank ID in fd->comm. + */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + aggr_ranks[i] = aggr; /* first aggregator ID of this request */ + avail_lens[i] = avail_len; /* length covered, may be < fd->flat_file.len[i] */ + assert(aggr >= 0 && aggr <= cb_nodes); + my_req[aggr].count++; /* increment for aggregator aggr */ + nelems++; /* true number of noncontiguous requests + * in terms of file domains */ + + /* rem_len is the amount of ith offset-length pair that is not covered + * by aggregator aggr's file domain. + */ + rem_len = fd->flat_file.len[i] - avail_len; + assert(rem_len >= 0); + + while (rem_len > 0) { + off += avail_len; /* move forward to first remaining byte */ + avail_len = rem_len; /* save remaining size, pass to calc */ + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + my_req[aggr].count++; + nelems++; + rem_len -= avail_len;/* reduce remaining length by amount from fd */ + } + } + + /* allocate space for buf_idx. + * buf_idx is relevant only if buftype is contiguous. buf_idx[i] gives the + * starting index in user_buf where data will be sent to aggregator 'i'. + * This allows sends to be done without extra buffer. + */ + if (buf_idx != NULL && buf_is_contig) { + buf_idx[0] = (MPI_Offset *) NCI_Malloc(nelems * sizeof(MPI_Offset)); + for (i = 1; i < cb_nodes; i++) + buf_idx[i] = buf_idx[i - 1] + my_req[i - 1].count; + } + + /* allocate space for my_req and its members offsets and lens */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) * 2; + my_req[0].offsets = (MPI_Offset*) NCI_Malloc(alloc_sz * nelems); + my_req[0].lens = my_req[0].offsets + my_req[0].count; + for (i=1; iflat_file.count; i++) { + /* short circuit offset/len processing if zero-byte read/write. */ + if (fd->flat_file.len[i] == 0) + continue; + + off = fd->flat_file.off[i]; + aggr = aggr_ranks[i]; + assert(aggr >= 0 && aggr <= cb_nodes); + avail_len = avail_lens[i]; + + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len = fd->flat_file.len[i] - avail_len; + + /* Each my_req[i] contains the number of this process's noncontiguous + * requests that fall into aggregator aggr's file domain. + * my_req[aggr].offsets[] and my_req[aggr].lens store the offsets and + * lengths of the requests. 
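+         *
+         * For example (values assumed): with striping_unit = 1 MiB and
+         * cb_nodes = 4, a request at off = 3.5 MiB of len = 1 MiB is split
+         * into two pieces:
+         *
+         *     stripe 3 -> aggregator 3 % 4 = 3: off = 3.5 MiB, len = 0.5 MiB
+         *     stripe 4 -> aggregator 4 % 4 = 0: off = 4.0 MiB, len = 0.5 MiB
+         *
+         * so one flat_file entry can contribute offset-length pairs to more
+         * than one my_req[] element.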
+ */ + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + + while (rem_len != 0) { + off += avail_len; + avail_len = rem_len; + aggr = LUSTRE_Calc_aggregator(fd, off, &avail_len); + assert(aggr >= 0 && aggr <= cb_nodes); + l = my_req[aggr].count; + if (buf_idx != NULL && buf_is_contig) { + buf_idx[aggr][l] = curr_idx; + curr_idx += avail_len; + } + rem_len -= avail_len; + + my_req[aggr].offsets[l] = off; + my_req[aggr].lens[l] = avail_len; + my_req[aggr].count++; + } + } + NCI_Free(aggr_ranks); +} + +/* LUSTRE_Calc_others_req() calculates what requests from each of other + * processes fall in this aggregator's file domain. + * IN: my_req[cb_nodes]: offset-length pairs of this rank's requests fall + * into each of aggregators + * OUT: count_others_req_per_proc[i]: number of noncontiguous requests of + * rank i that falls in this aggregator's file domain. + * OUT: others_req_ptr[nprocs]: requests of each of other ranks fall into + * this aggregator's file domain. + */ +static +void LUSTRE_Calc_others_req(PNCIO_File *fd, + const PNCIO_Access *my_req, + PNCIO_Access **others_req_ptr) +{ + int i, myrank, nprocs, do_alltoallv; + MPI_Count *count_my_req_per_proc, *count_others_req_per_proc; + PNCIO_Access *others_req; + size_t npairs, alloc_sz, pair_sz; + + /* first find out how much to send/recv and from/to whom */ + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + others_req = (PNCIO_Access *) NCI_Malloc(nprocs * sizeof(PNCIO_Access)); + *others_req_ptr = others_req; + + /* Use my_req[i].count (the number of noncontiguous requests fall in + * aggregator i's file domain) to set count_others_req_per_proc[j] (the + * number of noncontiguous requests from process j fall into this + * aggregator's file domain). + */ + count_my_req_per_proc = (MPI_Count *) NCI_Calloc(nprocs * 2, sizeof(MPI_Count)); + count_others_req_per_proc = count_my_req_per_proc + nprocs; + for (i=0; ihints->cb_nodes; i++) + count_my_req_per_proc[fd->hints->ranklist[i]] = my_req[i].count; + + MPI_Alltoall(count_my_req_per_proc, 1, MPI_COUNT, + count_others_req_per_proc, 1, MPI_COUNT, fd->comm); + + /* calculate total number of offset-length pairs to be handled by this + * aggregator, only aggregators will have non-zero number of pairs. + */ + npairs = 0; + for (i=0; inum_nodes > 0) ? (nprocs / fd->num_nodes > 48) : 0; +#else + do_alltoallv=0; +#endif + + if (do_alltoallv) { + MPI_Offset *r_off_buf=NULL, *s_off_buf=NULL; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) * 2 + sizeof(MPI_Aint) * 2; + sendCounts = (MPI_Count*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = (MPI_Aint*) (recvCounts + nprocs); + rdispls = sdispls + nprocs; +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 4; + sendCounts = (int*) NCI_Calloc(nprocs, alloc_sz); + recvCounts = sendCounts + nprocs; + sdispls = recvCounts + nprocs; + rdispls = sdispls + nprocs; +#endif + + /* prepare receive side */ + r_off_buf = others_req[0].offsets; + for (i=0; ihints->cb_nodes; i++) { + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = my_req[i].count * pair_sz; + /* Note all my_req[*].offsets are allocated in a single malloc(). 
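+             * Because of that single allocation, each my_req[i].offsets
+             * sits at a fixed byte distance from my_req[0].offsets, so the
+             * MPI_BYTE displacement for MPI_Alltoallv is just a pointer
+             * difference. Layout sketch (counts assumed):
+             *
+             *     base = my_req[0].offsets
+             *     | req[0].offsets | req[0].lens | req[1].offsets | ...
+             *     sdispls[dest] = (char*)my_req[i].offsets - (char*)base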
*/ + sdispls[dest] = (char*)my_req[i].offsets - (char*)s_off_buf; + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallv_c(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#else + MPI_Alltoallv(s_off_buf, sendCounts, sdispls, MPI_BYTE, + r_off_buf, recvCounts, rdispls, MPI_BYTE, fd->comm); +#endif + + NCI_Free(sendCounts); + } + else { /* instead of using alltoall, use MPI_Issend and MPI_Irecv */ + int nreqs; + MPI_Request *requests = (MPI_Request *) + NCI_Malloc((nprocs + fd->hints->cb_nodes) * sizeof(MPI_Request)); + + nreqs = 0; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count == 0) /* nothing to receive from rank i */ + continue; + + /* Note the memory address of others_req[i].lens is right after + * others_req[i].offsets. This allows the following recv call to + * receive both offsets and lens in a single call. + */ + if (i == myrank) { + /* send to self uses memcpy(), here + * others_req[i].count == my_req[fd->my_cb_nodes_index].count + */ + memcpy(others_req[i].offsets, + my_req[fd->my_cb_nodes_index].offsets, + my_req[fd->my_cb_nodes_index].count * pair_sz); + } + else { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#else + MPI_Irecv(others_req[i].offsets, others_req[i].count*pair_sz, + MPI_BYTE, i, 0, fd->comm, &requests[nreqs++]); +#endif + } + } + +#ifdef WKL_DEBUG +/* WRF hangs below when calling MPI_Waitall(), at running 16 nodes, 128 ranks + * per node on Perlmutter, when these 3 env variables are set: + * FI_UNIVERSE_SIZE = 2048 + * FI_CXI_DEFAULT_CQ_SIZE = 524288 + * FI_CXI_RX_MATCH_MODE = software + * + * Using MPI_Alltoallv seems to be able to avoid such hanging problem. (above) + */ +// MPI_Barrier(fd->comm); /* This barrier prevents the MPI_Waitall below from hanging !!! */ +#endif + + for (i=0; ihints->cb_nodes; i++) { + if (my_req[i].count == 0 || i == fd->my_cb_nodes_index) + continue; /* nothing to send or send to self */ + + /* Note the memory address of my_req[i].lens is right after + * my_req[i].offsets. This allows the following Issend call to + * send both offsets and lens in a single call. + */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Issend_c(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#else + MPI_Issend(my_req[i].offsets, my_req[i].count * pair_sz, MPI_BYTE, + fd->hints->ranklist[i], 0, fd->comm, &requests[nreqs++]); +#endif + } + + if (nreqs) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, requests, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, requests, statuses); + NCI_Free(statuses); +#endif + } + NCI_Free(requests); + } +} + +MPI_Offset PNCIO_LUSTRE_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. 
+ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + int i, j, nprocs, myrank; + int do_collect = 1, do_ex_wr; + MPI_Offset start_offset, end_offset; + MPI_Offset min_st_loc = -1, max_end_loc = -1; + MPI_Offset w_len=0; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + +// printf("%s %d: offset=%lld\n",__func__,__LINE__,offset); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +MPI_Barrier(fd->comm); +double curT = MPI_Wtime(); +#endif + + /* fd->flat_file contains a list of starting file offsets and lengths of + * write requests made by this rank. Similarly, buf_view contains a list of + * offset-length pairs describing the write buffer layout. Note as PnetCDF + * never re-uses a fileview or buffer view. + * + * Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and filetype + * set by the user are non-negative and monotonically non-decreasing. This + * makes fd->flat_file.off[] to be monotonically non-decreasing. + * + * This rank's aggregate file access region is from start_offset to + * end_offset. Note: end_offset points to the last byte-offset to be + * accessed. E.g., if start_offset=0 and end_offset=99, then the aggregate + * file access region is of size 100 bytes. If this rank has no data to + * write, end_offset == (start_offset - 1) + */ + MPI_Offset one_off; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset one_len; +#else + int one_len; +#endif + + if (fd->flat_file.count == 0) { /* TODO: is fd->flat_file.count == 0? */ + /* whole file is visible */ + start_offset = offset; + end_offset = offset + buf_view.size - 1; + if (buf_view.size > 0) { /* no-zero sized request */ + /* setting fd->flat_file is necessary for constructing my_req */ + one_off = offset; + one_len = buf_view.size; + fd->flat_file.off = &one_off; + fd->flat_file.len = &one_len; + fd->flat_file.count = 1; + } + } + else { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } +/* + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } +*/ +// if (myrank==0) printf("%s %d: fd->flat_file size=%lld count=%lld offset=%lld start_offset=%lld end_offset=%lld\n",__func__,__LINE__, fd->flat_file.size, fd->flat_file.count,offset,start_offset,end_offset); + + buf_view.idx = 0; + buf_view.rem = buf_view.size; + if (buf_view.count > 1) + buf_view.rem = buf_view.len[0]; + + if (fd->hints->cb_write == PNCIO_HINT_DISABLE) { + /* collective write is explicitly disabled by user */ + do_collect = 0; + } + else { + /* Calculate the aggregate access region of all ranks and check if + * write requests are interleaved among all ranks. + */ + int is_interleaved, large_indv_req = 1; + MPI_Offset striping_range, st_end[2], *st_end_all = NULL; + + /* Gather starting and ending file offsets of write requests from all + * ranks into st_end_all[]. Even indices of st_end_all[] are starting + * offsets, and odd indices are ending offsets. + */ + st_end[0] = start_offset; + st_end[1] = end_offset; + st_end_all = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + MPI_Allgather(st_end, 2, MPI_OFFSET, st_end_all, 2, MPI_OFFSET, fd->comm); + + /* The loop below does the followings. + * 1. 
Calculate this rank's aggregate access region. + * 2. Check whether or not the requests are interleaved among all ranks. + * 3. Check whether there are LARGE individual requests. Here, "large" + * means a write range is > (striping_factor * striping_unit). In + * this case, independent write will perform faster than collective. + */ + striping_range = fd->hints->striping_unit * fd->hints->striping_factor; + is_interleaved = 0; + for (i = 0; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + min_st_loc = st_end_all[i]; + max_end_loc = st_end_all[i + 1]; + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; /* j is the rank of making first non-zero request */ + i += 2; + break; + } + for (; i < nprocs * 2; i += 2) { + if (st_end_all[i] > st_end_all[i + 1]) { + /* process rank (i/2) has no data to write */ + continue; + } + if (st_end_all[i] < st_end_all[j+1]) { + /* start offset of process rank (i/2) is less than the end + * offset of process rank (i/2-1) + */ + is_interleaved = 1; + } + min_st_loc = MIN(st_end_all[i], min_st_loc); + max_end_loc = MAX(st_end_all[i + 1], max_end_loc); + if (st_end_all[i+1] - st_end_all[i] < striping_range) + large_indv_req = 0; + j = i; + } + NCI_Free(st_end_all); + +// if (myrank==0) printf("%s %d: do_collect=%d is_interleaved=%d buf_view size=%lld count=%lld is_contig=%d start_offset=%lld end_offset=%lld\n",__func__,__LINE__, do_collect,is_interleaved,buf_view.size,buf_view.count,buf_view.is_contig, start_offset,end_offset); + if (fd->hints->cb_write == PNCIO_HINT_ENABLE) { + /* explicitly enabled by user */ + do_collect = 1; + } + else if (fd->hints->cb_write == PNCIO_HINT_AUTO) { +// if (myrank==0) printf("%s %d: large_indv_req=%d cb_nodes=%d striping_factor=%d\n",__func__,__LINE__, large_indv_req,fd->hints->cb_nodes , fd->hints->striping_factor); + /* Check if collective write is actually necessary, only when + * cb_write hint is set to PNCIO_HINT_AUTO. + * + * Two typical access patterns can benefit from collective write. + * 1) access file regions of all processes are interleaved, and + * 2) the individual request sizes are not too big, i.e. no + * bigger than striping_range. Large individual requests may + * result in a high communication cost in order to + * redistribute requests from non-aggregators to I/O + * aggregators. 
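+             *
+             * A worked example of the heuristic (numbers assumed): with
+             * striping_unit = 1 MiB and striping_factor = 8, the
+             * striping_range is 8 MiB. If every rank writes one contiguous,
+             * non-interleaved 16 MiB block, each request already spans
+             * whole striping ranges on its own, and forwarding that data to
+             * aggregators would only add communication cost, so independent
+             * write is chosen below.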
+ */ + if (nprocs == 1) + do_collect = 0; + else if (!is_interleaved && large_indv_req && + fd->hints->cb_nodes <= fd->hints->striping_factor) { + /* do independent write, if every rank's write range > + * striping_range and writes are not interleaved in file + * space + */ + do_collect = 0; + } + } + } + + /* If collective I/O is determined not necessary, use independent I/O */ + if (!do_collect) { + + if (buf_view.size == 0) /* zero-sized request */ + return 0; + + if (fd->flat_file.is_contig && buf_view.is_contig) { + /* both buffer and fileview are contiguous */ + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_WriteContig !!!\n",__func__,__LINE__); +#endif + + return PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + +#ifdef WKL_DEBUG + printf("%s %d: SWITCH to PNCIO_LUSTRE_WriteStrided !!!\n", + __func__,__LINE__); +#endif + + return PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset); + } + + /* Now we are using collective I/O (two-phase I/O strategy) */ + +#ifdef ADJUST_STRIPING_UNIT + /* adjust striping_unit when striping_factor is twice or more than the + * number of compute nodes. Note cb_node is set to at least + * striping_factor, if nprocs >= striping_factor. Adjustment below is to + * let each aggregator to write to two or more consecutive OSTs, which can + * most likely improve the performance. This will still yield an effect of + * any one OST receiving write requests from aggregators running on only + * one compute node. + */ + int orig_striping_unit = fd->hints->striping_unit; + + if (fd->hints->striping_factor >= fd->num_nodes * 2) { + fd->hints->striping_unit *= (fd->hints->striping_factor / fd->num_nodes); + + if (fd->hints->cb_buffer_size < fd->hints->striping_unit) { + char value[MPI_MAX_INFO_VAL + 1]; + + fd->hints->cb_buffer_size = fd->hints->striping_unit; + sprintf(value, "%d", fd->hints->cb_buffer_size); + MPI_Info_set(fd->info, "cb_buffer_size", value); + if (fd->is_agg) { + NCI_Free(fd->io_buf); + fd->io_buf = (void*) NCI_Calloc(1, fd->hints->cb_buffer_size); + } + } +#ifdef WKL_DEBUG + if (myrank == 0) + printf("Warning: %s line %d: Change striping_unit from %d to %d\n", + __func__, __LINE__, orig_striping_unit, fd->hints->striping_unit); +#endif + } +#endif + + /* my_req[cb_nodes] is an array of access info, one for each I/O aggregator + * whose file domain has this rank's request. + */ + PNCIO_Access *my_req; + + /* others_req[nprocs] is an array of access info, one for each ranks, both + * aggregators and non-aggregators, whose write requests fall into this + * aggregator's file domain. others_req[] matters only for aggregators. + */ + PNCIO_Access *others_req; + MPI_Offset **buf_idx = NULL; + + if (buf_view.is_contig) + buf_idx = (MPI_Offset **) NCI_Malloc(fd->hints->cb_nodes * + sizeof(MPI_Offset*)); + + /* Calculate the portions of this rank's write requests that fall into the + * file domains of each I/O aggregator. No inter-process communication is + * performed in LUSTRE_Calc_my_req(). + */ + LUSTRE_Calc_my_req(fd, buf_view.is_contig, &my_req, buf_idx); + + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) { + /* When data sieving is considered, below check the current file size + * first. If the aggregate access region of this collective write is + * beyond the current file size, then we can safely skip the read of + * the read-modify-write of data sieving. + */ + if (fd->is_agg) { + /* Obtain the current file size. 
Note an MPI_Allgather() has been + * called above to calculate the aggregate access region. Thus all + * prior independent I/O should have completed by now, so it is + * safe to call lseek() to query the file size. + */ + MPI_Offset cur_off, fsize; + + cur_off = lseek(fd->fd_sys, 0, SEEK_CUR); + fsize = lseek(fd->fd_sys, 0, SEEK_END); + /* Ignore the error, and proceed as if file size is very large. */ +#ifdef PNETCDF_DEBUG + if (fsize == -1) + fprintf(stderr, "%s at %d: lseek SEEK_END failed on file %s (%s)\n", + __func__,__LINE__, fd->filename, strerror(errno)); +#endif + fd->skip_read = (fsize >=0 && min_st_loc >= fsize); + + /* restore file pointer */ + lseek(fd->fd_sys, cur_off, SEEK_SET); + } + } + else + fd->skip_read = 1; + +// if (fd->is_agg && !fd->skip_read) { MPI_Offset fsize = lseek(fd->fd_sys, 0, SEEK_END); printf("%d: %s at %d: skip_read=%d min_st_loc=%lld fsize=%lld\n",myrank,__func__,__LINE__,fd->skip_read,min_st_loc,fsize); } + + /* For aggregators, calculate the portions of all other ranks' requests + * fall into this aggregator's file domain (note only I/O aggregators are + * assigned file domains). + * + * Inter-process communication is required to construct others_req[], + * including MPI_Alltoall, MPI_Issend, MPI_Irecv, and MPI_Waitall. + */ + LUSTRE_Calc_others_req(fd, my_req, &others_req); + + /* Two-phase I/O: first communication phase to exchange write data from all + * ranks to the I/O aggregators, followed by the write phase where only I/O + * aggregators write to the file. + * + * Unless MPI_Alltoallw() is used (when use_alltoallw is set to 1), there + * is no collective MPI communication beyond this point, as + * LUSTRE_Exch_and_write() calls only MPI_Issend, MPI_Irecv, and + * MPI_Waitall. Thus it is safe for those non-aggregators making zero-sized + * request to skip the call. + */ + + /* if this rank has data to write, then participate exchange-and-write */ + do_ex_wr = (buf_view.size == 0) ? 0 : 1; + use_alltoallw = 0; + +#ifdef USE_MPI_ALLTOALLW + { + /* When num_nodes < striping_factor, using MPI_Alltoallw in + * commit_comm_phase() is faster than MPI_Issend/MPI_Irecv ... ? + */ + char *env_str; + if ((env_str = getenv("PNETCDF_USE_ALLTOALLW")) != NULL) + use_alltoallw = (strcasecmp(env_str, "true") == 0) ? 1: 0; + } +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + + if (do_ex_wr || fd->is_agg) + /* This rank participates exchange and write only when it has non-zero + * data to write or is an I/O aggregator + */ + w_len = LUSTRE_Exch_and_write(fd, buf, buf_view, others_req, my_req, + min_st_loc, max_end_loc, buf_idx); + + /* free all memory allocated */ + NCI_Free(others_req[0].offsets); + NCI_Free(others_req); + + if (buf_idx != NULL) { + NCI_Free(buf_idx[0]); + NCI_Free(buf_idx); + } + NCI_Free(my_req[0].offsets); + NCI_Free(my_req); + +#ifdef ADJUST_STRIPING_UNIT + /* restore the original striping_unit */ + fd->hints->striping_unit = orig_striping_unit; +#endif + + /* If this collective write is followed by an independent write, it's + * possible to have those subsequent writes on other processes race ahead + * and sneak in before the read-modify-write completes. We carry out a + * collective communication at the end here so no one can start independent + * I/O before collective I/O completes. 
+ * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information + */ + /* optimization: if only one process performing I/O, we can perform + * a less-expensive Bcast. */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +static +void comm_phase_alltoallw(PNCIO_File *fd, + disp_len_list *send_list, /* [cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine performs the sam communication tasks as the below + * commit_comm_phase(), but using MPI_Alltoallw() instead of MPI_Issend and + * MPI_Irecv. + * + * It creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes. + */ + int i, nprocs, rank; + size_t alloc_sz; + MPI_Datatype *sendTypes, *recvTypes; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + /* calculate send/recv derived types metadata */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *sendCounts, *recvCounts; + MPI_Aint *sdispls, *rdispls; + alloc_sz = sizeof(MPI_Count) + sizeof(MPI_Aint); + sendCounts = (MPI_Count*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (MPI_Aint*) (sendCounts + (nprocs * 2)); +#else + int *sendCounts, *recvCounts, *sdispls, *rdispls; + alloc_sz = sizeof(int) * 2; + sendCounts = (int*) NCI_Calloc(nprocs * 2, alloc_sz); + sdispls = (int*) (sendCounts + (nprocs * 2)); +#endif + recvCounts = sendCounts + nprocs; + rdispls = sdispls + nprocs; + + /* allocate send/recv derived type arrays */ + sendTypes = (MPI_Datatype*)NCI_Malloc(sizeof(MPI_Datatype) * nprocs * 2); + recvTypes = sendTypes + nprocs; + + for (i=0; iis_agg && recv_list != NULL) { + for (i=0; ihints->cb_nodes; i++) { + /* check if nothing to send or if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + + int dest = fd->hints->ranklist[i]; + sendCounts[dest] = 1; + + /* combine reqs using new datatype */ +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#else + MPI_Type_create_hindexed(send_list[i].count, send_list[i].len, + send_list[i].disp, MPI_BYTE, + &sendTypes[dest]); +#endif + MPI_Type_commit(&sendTypes[dest]); + } + +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Alltoallw_c(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#else + MPI_Alltoallw(MPI_BOTTOM, sendCounts, sdispls, sendTypes, + MPI_BOTTOM, recvCounts, rdispls, recvTypes, fd->comm); +#endif + + for (i=0; ihints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +static +void commit_comm_phase(PNCIO_File *fd, + disp_len_list *send_list, /* 
[cb_nodes] */ + disp_len_list *recv_list) /* [nprocs] */ +{ + /* This subroutine creates a datatype combining all displacement-length + * pairs in each element of send_list[]. The datatype is used when calling + * MPI_Issend to send write data to the I/O aggregators. Similarly, it + * creates a datatype combining all displacement-length pairs in each + * element of recv_list[] and uses it when calling MPI_Irecv or MPI_Recv + * to receive write data from all processes. + */ + int i, nprocs, rank, nreqs; + MPI_Request *reqs; + MPI_Datatype sendType, recvType; +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + int j; + double dtype_time=MPI_Wtime(); +#endif + + if (use_alltoallw) + return comm_phase_alltoallw(fd, send_list, recv_list); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &rank); + + nreqs = fd->hints->cb_nodes; + nreqs += (fd->is_agg) ? nprocs : 0; + reqs = (MPI_Request *)NCI_Malloc(sizeof(MPI_Request) * nreqs); + nreqs = 0; + + /* receiving part */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* recv buffer type profiling */ + int nrecvs=0; + MPI_Offset max_r_amnt=0, max_r_count=0; +#endif + + if (fd->is_agg && recv_list != NULL) { + for (i = 0; i < nprocs; i++) { + /* check if nothing to receive or if self */ + if (recv_list[i].count == 0 || i == rank) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset r_amnt=0; + for (j=0; jatomicity) { /* Blocking Recv */ + MPI_Status status; + MPI_Recv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, &status); + } + else + MPI_Irecv(MPI_BOTTOM, 1, recvType, i, 0, fd->comm, + &reqs[nreqs++]); + MPI_Type_free(&recvType); + } + } + + /* send reqs */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + /* send buffer type profiling */ + int nsends=0; + MPI_Offset max_s_amnt=0, max_s_count=0; +#endif + + for (i = 0; i < fd->hints->cb_nodes; i++) { + /* check if nothing to send or if self */ + if (send_list[i].count == 0 || i == fd->my_cb_nodes_index) continue; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + MPI_Offset s_amnt=0; + for (j=0; jhints->ranklist[i], 0, + fd->comm, &reqs[nreqs++]); + MPI_Type_free(&sendType); + } + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[4] += MPI_Wtime() - dtype_time; + +/* + fd->write_counter[2] = MAX(fd->write_counter[2], nsends); + fd->write_counter[3] = MAX(fd->write_counter[3], nrecvs); + fd->write_counter[4] = MAX(fd->write_counter[4], max_r_amnt); + fd->write_counter[5] = MAX(fd->write_counter[5], max_s_amnt); + fd->write_counter[6] = MAX(fd->write_counter[6], max_r_count); + fd->write_counter[7] = MAX(fd->write_counter[7], max_s_count); +*/ +#endif + + if (nreqs > 0) { +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nreqs, reqs, MPI_STATUSES_IGNORE); +#else + MPI_Status *statuses = (MPI_Status *) + NCI_Malloc(nreqs * sizeof(MPI_Status)); + MPI_Waitall(nreqs, reqs, statuses); + NCI_Free(statuses); +#endif + } + + NCI_Free(reqs); + + /* clear send_list and recv_list for future reuse */ + for (i = 0; i < fd->hints->cb_nodes; i++) + send_list[i].count = 0; + + if (recv_list != NULL) + for (i = 0; i < nprocs; i++) + recv_list[i].count = 0; +} + +/*----< LUSTRE_Exch_and_write() >--------------------------------------------*/ +/* Each process sends all its write requests to I/O aggregators based on the + * file domain assignment to the aggregators. In this implementation, a file is + * first divided into stripes which are assigned to the aggregators in a + * round-robin fashion. 
The "exchange" of write data from non-aggregators to + * aggregators is carried out in 'ntimes' rounds. Each round covers an + * aggregate file region of size equal to the file stripe size times the number + * of I/O aggregators. The file writes are carried out in every 'nbufs' + * iterations, where 'nbufs' == cb_buffer_size / file stripe size. This approach + * is different from ROMIO's implementation as in MPICH 4.2.3. + * + * Other implementations developers are referring to the paper: Wei-keng Liao, + * and Alok Choudhary. "Dynamically Adapting File Domain Partitioning Methods + * for Collective I/O Based on Underlying Parallel File System Locking + * Protocols", in The Supercomputing Conference, 2008. + */ +static +MPI_Offset LUSTRE_Exch_and_write(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + PNCIO_Access *others_req, + PNCIO_Access *my_req, + MPI_Offset min_st_loc, + MPI_Offset max_end_loc, + MPI_Offset **buf_idx) +{ + char **write_buf = NULL, **recv_buf = NULL, **send_buf = NULL; + size_t alloc_sz; + int nprocs, myrank, nbufs, ibuf, batch_idx=0, cb_nodes, striping_unit; + MPI_Count i, j, m, ntimes; + MPI_Count **recv_size=NULL, **recv_count=NULL; + MPI_Count **recv_start_pos=NULL, *send_size; + MPI_Offset end_loc, req_off, iter_end_off, *off_list, step_size; + MPI_Offset *this_buf_idx=NULL; + off_len_list *srt_off_len = NULL; + disp_len_list *send_list = NULL, *recv_list = NULL; + MPI_Offset w_len, total_w_len=0; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; + striping_unit = fd->hints->striping_unit; + + /* The aggregate access region (across all processes) of this collective + * write starts from min_st_loc and ends at max_end_loc. The collective + * write is carried out in 'ntimes' rounds of two-phase I/O. Each round + * covers an aggregate file region of size 'step_size' written only by + * cb_nodes number of I/O aggregators. Note non-aggregators must also + * participate all ntimes rounds to send their requests to I/O aggregators. + * + * step_size = the number of I/O aggregators x striping_unit + * + * Note the number of write phases = ntimes / nbufs, as writes (and + * communication) are accumulated for nbufs rounds before flushed. + */ + step_size = (MPI_Offset)cb_nodes * striping_unit; + + /* align min_st_loc downward to the nearest file stripe boundary */ + min_st_loc -= min_st_loc % (MPI_Offset) striping_unit; + + /* ntimes is the number of rounds of two-phase I/O */ + ntimes = (max_end_loc - min_st_loc + 1) / step_size; + if ((max_end_loc - min_st_loc + 1) % step_size) + ntimes++; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], ntimes); +#endif + + /* collective buffer is divided into 'nbufs' sub-buffers. Each sub-buffer + * is of size equal to Lustre stripe size. Write data of non-aggregators + * are sent to aggregators and stored in aggregators' sub-buffers, one for + * each round. All nbufs sub-buffers are altogether flushed to file every + * nbufs rounds. + * + * fd->hints->cb_buffer_size, collective buffer size, for Lustre must be at + * least striping_unit. This requirement has been checked at the file + * open/create time when fd->io_buf is allocated. + * + * Note cb_buffer_size and striping_unit may also be adjusted earlier in + * PNCIO_LUSTRE_WriteStridedColl(). 
+ */ + nbufs = fd->hints->cb_buffer_size / striping_unit; + assert(nbufs > 0); /* must at least 1 */ + + /* in case number of rounds is less than nbufs */ + nbufs = (ntimes < nbufs) ? (int)ntimes : nbufs; + + /* off_list[m] is the starting file offset of this aggregator's write + * region in iteration m (file domain of iteration m). This offset + * may not be aligned with file stripe boundaries. + * end_loc is the ending file offset of this aggregator's file domain. + */ + off_list = (MPI_Offset *) NCI_Malloc(ntimes * sizeof(MPI_Offset)); + end_loc = -1; + for (m = 0; m < ntimes; m++) + off_list[m] = max_end_loc; + for (i = 0; i < nprocs; i++) { +// if (myrank == 0) printf("%s at %d: others_req[%d] count=%lld\n",__func__,__LINE__, i,others_req[i].count); + for (j = 0; j < others_req[i].count; j++) { + req_off = others_req[i].offsets[j]; + m = (int) ((req_off - min_st_loc) / step_size); + off_list[m] = MIN(off_list[m], req_off); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + others_req[i].lens[j] - 1)); + } + } +// if (myrank == 0) printf("%s at %d: end_loc=%lld nbufs=%d recv_list=%s\n",__func__,__LINE__, end_loc,nbufs,(recv_list==NULL)?"NULL":"NOT NULL"); + + /* Allocate displacement-length pair arrays, describing the send buffer. + * send_list[i].count: number displacement-length pairs. + * send_list[i].len: length in bytes. + * send_list[i].disp: displacement (send buffer address). + */ + send_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * cb_nodes); + for (i = 0; i < cb_nodes; i++) { + send_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + send_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = send_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + send_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + send_list[i].len = (int*) (send_list[i].disp + nbufs); +#endif + } + + /* end_loc >= 0 indicates this process has something to write to the file. + * Only I/O aggregators can have end_loc > 0. write_buf is the collective + * buffer and only matter for I/O aggregators. recv_buf is the buffer used + * only by aggregators to receive requests from non-aggregators. Its size + * may be larger then the file stripe size, in case when writes from + * non-aggregators overlap. In this case, it will be realloc-ed in + * LUSTRE_W_Exchange_data(). The received data is later copied over to + * write_buf, whose contents will be written to file. + */ + if (end_loc >= 0 && nbufs > 0) { + /* Allocate displacement-length pair arrays, describing the recv buffer. + * recv_list[i].count: number displacement-length pairs. + * recv_list[i].len: length in bytes. + * recv_list[i].disp: displacement (recv buffer address). + */ + assert(fd->is_agg); + + recv_list = (disp_len_list*) NCI_Malloc(sizeof(disp_len_list) * nprocs); + for (i = 0; i < nprocs; i++) { + recv_list[i].count = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Count) * 2; + recv_list[i].disp = (MPI_Count*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = recv_list[i].disp + nbufs; +#else + alloc_sz = sizeof(MPI_Aint) + sizeof(int); + recv_list[i].disp = (MPI_Aint*) NCI_Malloc(alloc_sz * nbufs); + recv_list[i].len = (int*) (recv_list[i].disp + nbufs); +#endif + } + + /* collective buffer was allocated at file open/create. For Lustre, its + * size must be at least striping_unit, which has been checked at the + * time fd->io_buf is allocated. 
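+         *
+         * The carving of fd->io_buf into nbufs stripe-sized sub-buffers,
+         * done just below, is plain pointer arithmetic; a sketch:
+         *
+         *     write_buf[0] = fd->io_buf;
+         *     for (j = 1; j < nbufs; j++)
+         *         write_buf[j] = write_buf[j-1] + striping_unit;
+         *
+         * so each write_buf[j] aliases the j-th stripe-sized slice of the
+         * collective buffer rather than being a separate allocation.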
+ */ + assert(fd->io_buf != NULL); + + /* divide collective buffer into nbufs sub-buffers */ + write_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + write_buf[0] = fd->io_buf; + + /* Similarly, receive buffer consists of nbufs sub-buffers */ + recv_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + recv_buf[0] = (char *) NCI_Malloc(striping_unit); + + /* recv_count[j][i] is the number of off-len pairs to be received from + * each proc i in round j + */ + recv_count = (MPI_Count**) NCI_Malloc(3 * nbufs * sizeof(MPI_Count*)); + recv_count[0] = (MPI_Count*) NCI_Malloc(3 * nbufs * nprocs * sizeof(MPI_Count)); + + /* recv_size[j][i] is the receive size from proc i in round j */ + recv_size = recv_count + nbufs; + recv_size[0] = recv_count[0] + nbufs * nprocs; + + /* recv_start_pos[j][i] is the starting index of offset-length arrays + * pointed by others_req[i].curr for remote rank i in round j + */ + recv_start_pos = recv_size + nbufs; + recv_start_pos[0] = recv_size[0] + nbufs * nprocs; + + for (j = 1; j < nbufs; j++) { + write_buf[j] = write_buf[j-1] + striping_unit; + /* recv_buf[j] may be realloc in LUSTRE_W_Exchange_data() */ + recv_buf[j] = (char *) NCI_Malloc(striping_unit); + recv_count[j] = recv_count[j-1] + nprocs; + recv_size[j] = recv_size[j-1] + nprocs; + recv_start_pos[j] = recv_start_pos[j-1] + nprocs; + } + + /* srt_off_len consists of file offset-length pairs sorted in a + * monotonically non-decreasing order (required by MPI-IO standard) + * which is used when writing to the file + */ + srt_off_len = (off_len_list*) NCI_Malloc(nbufs * sizeof(off_len_list)); + } + + /* send_buf[] will be allocated in LUSTRE_W_Exchange_data(), when the use + * buffer is not contiguous. + */ + send_buf = (char **) NCI_Malloc(nbufs * sizeof(char*)); + + /* this_buf_idx contains indices to the user write buffer for sending this + * rank's write data to aggregators, one for each aggregator. It is used + * only when user buffer is contiguous. + */ + if (buf_view.is_contig) + this_buf_idx = (MPI_Offset *) NCI_Malloc(sizeof(MPI_Offset) * cb_nodes); + + /* array of data sizes to be sent to each aggregator in a 2-phase round */ + send_size = (MPI_Count *) NCI_Calloc(cb_nodes, sizeof(MPI_Count)); + + /* min_st_loc is the beginning file offsets of the aggregate access region + * of this collective write, and it has been downward aligned to the + * nearest file stripe boundary + * iter_end_off is the ending file offset of aggregate write region of + * iteration m, upward aligned to the file stripe boundary. + */ + iter_end_off = min_st_loc + step_size; + + ibuf = 0; + for (m = 0; m < ntimes; m++) { + MPI_Count range_size; + MPI_Offset range_off; + + /* Note that MPI standard (MPI 3.1 Chapter 13.1.1 and MPI 4.0 Chapter + * 14.1.1) requires that the typemap displacements of etype and + * filetype are non-negative and monotonically non-decreasing. This + * simplifies implementation a bit compared to reads. + */ + + /* Calculate what should be communicated. + * + * First, calculate the amount to be sent to each aggregator i, at this + * round m, by going through all offset-length pairs in my_req[i]. + * + * iter_end_off - ending file offset of aggregate write region of this + * round, and upward aligned to the file stripe + * boundary. Note the aggregate write region of this + * round starts from (iter_end_off-step_size) to + * iter_end_off, aligned with file stripe boundaries. + * send_size[i] - total size in bytes of this process's write data + * fall into aggregator i's FD in this round. 
+         * recv_size[m][i]  - size in bytes of the data to be received by
+         *                    this aggregator from process i in round m.
+         * recv_count[m][i] - number of noncontiguous offset-length pairs
+         *                    from process i that fall into this
+         *                    aggregator's write region in round m.
+         */
+        for (i = 0; i < cb_nodes; i++) {
+            /* reset communication metadata to all 0s for this round */
+            send_size[i] = 0;
+
+            /* my_req[i].count is the number of this rank's offset-length
+             * pairs to be sent to aggregator i
+             */
+            if (my_req[i].count == 0) continue;
+
+            if (my_req[i].curr == my_req[i].count)
+                continue; /* done with aggregator i */
+
+            if (buf_view.is_contig)
+                /* buf_idx is used only when the user buffer is contiguous.
+                 * this_buf_idx[i] points to the starting offset of the user
+                 * buffer, buf, for the amount of send_size[i] to be sent to
+                 * aggregator i in this round.
+                 */
+                this_buf_idx[i] = buf_idx[i][my_req[i].curr];
+
+            /* calculate the send amount from this rank to aggregator i */
+            for (j = my_req[i].curr; j < my_req[i].count; j++) {
+                if (my_req[i].offsets[j] < iter_end_off)
+                    send_size[i] += my_req[i].lens[j];
+                else
+                    break;
+            }
+
+            /* update my_req[i].curr to point to the jth offset-length pair
+             * of my_req[i], which will be used as the first pair in the
+             * next round of iteration.
+             */
+            my_req[i].curr = j;
+        }
+
+        /* range_off is the starting file offset of this aggregator's write
+         * region in this round (it may not be aligned to a stripe
+         * boundary). range_size is the size (in bytes) of this aggregator's
+         * write region in this round (always <= striping_unit).
+         */
+        range_off = off_list[m];
+        range_size = MIN(striping_unit - range_off % striping_unit,
+                         end_loc - range_off + 1);
+
+        /* Calculate the amount to be received from each process i in this
+         * round, by going through all offset-length pairs of
+         * others_req[i].
+         */
+        if (recv_count != NULL) {
+            for (i = 0; i < nprocs; i++) {
+                recv_count[ibuf][i] = 0;
+                recv_size[ibuf][i] = 0;
+                recv_start_pos[ibuf][i] = others_req[i].curr;
+                for (j = others_req[i].curr; j < others_req[i].count; j++) {
+                    if (others_req[i].offsets[j] < iter_end_off) {
+                        recv_count[ibuf][i]++;
+                        recv_size[ibuf][i] += others_req[i].lens[j];
+                    } else
+                        break;
+                }
+                others_req[i].curr = j;
+            }
+        }
+
+        iter_end_off += step_size;
+
+        /* communication for this round (posting sends/receives and
+         * completing a batch of nbufs rounds) is carried out by
+         * Exchange_data_send() and Exchange_data_recv() defined below
+         */
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+        if (fd->is_agg) fd->write_timing[3] += MPI_Wtime() - curT;
+#endif
+
+        /* free send_buf allocated in LUSTRE_W_Exchange_data() */
+        for (j = 0; j < numBufs; j++) {
+            if (send_buf[j] != NULL) {
+                NCI_Free(send_buf[j]);
+                send_buf[j] = NULL;
+            }
+        }
+        if (!fd->is_agg) /* non-aggregators are done for this batch */
+            continue;
+
+        if (recv_list == NULL) /* this aggregator has nothing to write */
+            continue;
+
+        /* this aggregator unpacks the data in recv_buf[] into write_buf */
+        if (end_loc >= 0) {
+            for (j = 0; j < numBufs; j++) {
+                char *buf_ptr = recv_buf[j];
+                for (i = 0; i < nprocs; i++) {
+                    if (recv_count[j][i] > 1 && i != myrank) {
+                        /* When recv_count[j][i] == 1, this case has been
+                         * taken care of earlier by receiving the message
+                         * directly into write_buf.
+                         */
+                        MEMCPY_UNPACK(i, buf_ptr, recv_start_pos[j][i],
+                                      recv_count[j][i], write_buf[j]);
+                        buf_ptr += recv_size[j][i];
+                    }
+                }
+            }
+        }
+
+        /* this aggregator writes to numBufs number of stripes */
+        for (j = 0; j < numBufs; j++) {
+            /* When srt_off_len[j].num > 1,
+             * data sieving was not performed and holes have been found. In
+             * this case, srt_off_len[] is the list of sorted offset-length
+             * pairs describing noncontiguous writes. Now call writes for
+             * each offset-length pair. Note the offset-length pairs
+             * (represented by srt_off_len[j].off, srt_off_len[j].len, and
+             * srt_off_len[j].num) have been coalesced in
+             * LUSTRE_W_Exchange_data().
+             */
+// printf("%s at %d: num=%d\n",__func__,__LINE__, srt_off_len[j].num);
+            for (i = 0; i < srt_off_len[j].num; i++) {
+                /* all write requests in this round should fall into the
+                 * file range of [range_off, range_off+range_size). The
+                 * assertion below should never fail.
+ */ + assert(srt_off_len[j].off[i] < range_off + range_size && + srt_off_len[j].off[i] >= range_off); + +// printf("%s at %d: PNCIO_WriteContig num=%d [%d] off=%lld len=%lld\n",__func__,__LINE__, srt_off_len[j].num,i,srt_off_len[j].off[i],srt_off_len[j].len[i]); + w_len = PNCIO_WriteContig(fd, + write_buf[j] + (srt_off_len[j].off[i] - range_off), + srt_off_len[j].len[i], + srt_off_len[j].off[i]); + if (w_len < 0) goto over; + total_w_len += w_len; + } + if (srt_off_len[j].num > 0) { + NCI_Free(srt_off_len[j].off); + srt_off_len[j].num = 0; + } + } + batch_idx += numBufs; /* only matters for aggregators */ + } + } + + over: + if (srt_off_len) + NCI_Free(srt_off_len); + if (write_buf != NULL) + NCI_Free(write_buf); + if (recv_buf != NULL) { + for (j = 0; j < nbufs; j++) + NCI_Free(recv_buf[j]); + NCI_Free(recv_buf); + } + if (recv_count != NULL) { + NCI_Free(recv_count[0]); + NCI_Free(recv_count); + } + NCI_Free(send_size); + NCI_Free(off_list); + if (buf_view.is_contig) + NCI_Free(this_buf_idx); + if (send_buf != NULL) + NCI_Free(send_buf); + if (send_list != NULL) { + for (i = 0; i < cb_nodes; i++) + NCI_Free(send_list[i].disp); + NCI_Free(send_list); + } + if (recv_list != NULL) { + for (i = 0; i < nprocs; i++) + NCI_Free(recv_list[i].disp); + NCI_Free(recv_list); + } + +#ifdef WKL_DEBUG + /* check any pending messages to be received */ + MPI_Status probe_st; + int probe_flag; + MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, fd->comm, &probe_flag, &probe_st); + if (probe_flag) { + printf("ERROR ++++ MPI_Iprobe rank=%4d is_agg=%d: ---- cb_nodes=%d ntimes=%lld nbufs=%d\n",myrank,fd->is_agg,cb_nodes,ntimes,nbufs); + fflush(stdout); + } +#endif + return total_w_len; +} + +/* This heap-merge sort also coalesces sorted offset-length pairs whenever + * possible. + * + * Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 modified for a + * heap with smallest element at root. The recursion has been removed so that + * there are no function calls. Function calls are too expensive. + */ +static +void heap_merge(const PNCIO_Access *others_req, + const MPI_Count *count, +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count *srt_off, + MPI_Count *srt_len, +#else + MPI_Offset *srt_off, + int *srt_len, +#endif + const MPI_Count *start_pos, + int nprocs, + int nprocs_recv, + MPI_Count *total_elements) +{ + typedef struct { + MPI_Offset *off_list; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset *len_list; +#else + int *len_list; +#endif + MPI_Count nelem; + } heap_struct; + + heap_struct *a, tmp; + int i, j, heapsize, l, r, k, smallest; + + a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct)); + + j = 0; + for (i = 0; i < nprocs; i++) { + if (count[i]) { + a[j].off_list = others_req[i].offsets + start_pos[i]; + a[j].len_list = others_req[i].lens + start_pos[i]; + a[j].nelem = count[i]; + j++; + } + } + +#define SWAP(x, y, tmp) { tmp = x ; x = y ; y = tmp ; } + + heapsize = nprocs_recv; + + /* Build a heap out of the first element from each list, with the smallest + * element of the heap at the root. The first for loop is to find and move + * the smallest a[*].off_list[0] to a[0]. 
+ */ + for (i = heapsize / 2 - 1; i >= 0; i--) { + k = i; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + + /* The heap keeps the smallest element in its first element, i.e. + * a[0].off_list[0]. + */ + j = 0; + for (i = 0; i < *total_elements; i++) { + /* extract smallest element from heap, i.e. the root */ + if (j == 0 || srt_off[j - 1] + srt_len[j - 1] < *(a[0].off_list)) { + srt_off[j] = *(a[0].off_list); + srt_len[j] = *(a[0].len_list); + j++; + } else { + /* this offset-length pair can be coalesced into the previous one */ + srt_len[j - 1] = *(a[0].off_list) + *(a[0].len_list) - srt_off[j - 1]; + } + (a[0].nelem)--; + + if (a[0].nelem) { + (a[0].off_list)++; + (a[0].len_list)++; + } else { + a[0] = a[heapsize - 1]; + heapsize--; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + r = 2 * (k + 1); + l = r - 1; + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + SWAP(a[k], a[smallest], tmp); + k = smallest; + } else + break; + } + } + NCI_Free(a); + *total_elements = j; +} + +#define CACHE_REQ(list, nelems, buf) { \ + MPI_Aint buf_addr; \ + list.len[list.count] = nelems; \ + MPI_Get_address(buf, &buf_addr); \ + list.disp[list.count] = buf_addr; \ + list.count++; \ +} + +static +int Exchange_data_recv( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file */ + char **recv_buf, /* OUT: [nbufs] internal buffer used to + * receive from other processes */ + const PNCIO_View *buf_view, /* IN: flattened buffer + * offset-length pairs */ + const MPI_Count *recv_size, /* [nprocs] recv_size[i] is amount of + * this aggregator recv from rank i */ + MPI_Offset range_off, /* starting file offset of this + * aggregator's write region */ + MPI_Count range_size, /* amount of this aggregator's write + * region */ + const MPI_Count *recv_count, /* [nprocs] recv_count[i] is the number + * of offset-length pairs received from + * rank i */ + const MPI_Count *start_pos, /* [nprocs] start_pos[i] starting value + * of others_req[i].curr */ + const PNCIO_Access *others_req, /* [nprocs] others_req[i] is rank i's + * write requests fall into this + * aggregator's file domain */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * offsets for sending this rank's + * write data to aggregator i */ + off_len_list *srt_off_len, /* OUT: list of write offset-length + * pairs of this aggregator */ + disp_len_list *recv_list) /* OUT: displacement-length pairs of + * recv buffer */ +{ + char *buf_ptr, *contig_buf; + size_t alloc_sz; + int i, j, nprocs, myrank, nprocs_recv, hole, build_srt_off_len; + MPI_Count sum_recv; + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + + /* srt_off_len contains the file offset-length pairs to be written by this + * aggregator at this round. The file region starts from range_off with + * size of range_size. 
+ */ + + srt_off_len->num = 0; + srt_off_len->off = NULL; + sum_recv = 0; + nprocs_recv = 0; + + /* calculate receive metadata */ + j = -1; + for (i = 0; i < nprocs; i++) { + srt_off_len->num += recv_count[i]; + if (j == -1 && recv_count[i] > 0) j = i; + sum_recv += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + } + + if (nprocs_recv == 0) return NC_NOERR; + +// MPI_Count numx = srt_off_len->num; printf("nprocs_recv=%d PNCIO_DS_WR_NAGGRS_LB=%d srt_off_len->num=%lld PNCIO_DS_WR_NPAIRS_LB=%d\n",nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,srt_off_len->num,PNCIO_DS_WR_NPAIRS_LB); + + /* determine whether checking holes is necessary */ + if (srt_off_len->num == 0) { + /* this process has nothing to receive and hence no hole */ + build_srt_off_len = 0; + hole = 0; + } else if (srt_off_len->num == 1) { + build_srt_off_len = 0; + hole = 0; +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + srt_off_len->off[0] = others_req[j].offsets[start_pos[j]]; + srt_off_len->len[0] = others_req[j].lens[start_pos[j]]; + } else if (fd->hints->ds_write == PNCIO_HINT_ENABLE) { + /* skip building of srt_off_len and proceed to read-modify-write */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } else if (fd->hints->ds_write == PNCIO_HINT_AUTO) { + if (DO_HEAP_MERGE(nprocs_recv, srt_off_len->num)) { + /* When the number of sorted offset-length lists or the total + * number of offset-length pairs are too large, the heap-merge sort + * below for building srt_off_len can become very expensive. Such + * sorting is also used to check holes to determine whether + * read-modify-write is necessary. + */ + build_srt_off_len = 0; + /* assuming there are holes */ + hole = 1; + } + else /* heap-merge is less expensive, proceed to build srt_off_len */ + build_srt_off_len = 1; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (build_srt_off_len) { + fd->write_counter[1]++; + fd->write_counter[2] = MAX(fd->write_counter[2], srt_off_len->num); + fd->write_counter[3] = MAX(fd->write_counter[3], nprocs_recv); + } else { + fd->write_counter[4]++; + fd->write_counter[5] = MAX(fd->write_counter[5], srt_off_len->num); + fd->write_counter[6] = MAX(fd->write_counter[6], nprocs_recv); + } +#endif + } else { /* if (fd->hints->ds_write == PNCIO_HINT_DISABLE) */ + /* User explicitly disable data sieving to skip read-modify-write. + * Whether or not there is a hole is not important. However, + * srt_off_len must be constructed to merge all others_req[] into a + * single sorted list. This step is necessary because after this + * subroutine returns, write data from all non-aggregators will be + * packed into the write_buf, with a possibility of overlaps, and + * as srt_off_len stores the coalesced offset-length pairs of + * individual non-contiguous write requests, it is used to write them + * to the file. + */ + build_srt_off_len = 1; + } + + if (build_srt_off_len) { + /* merge all the offset-length pairs from others_req[] (already sorted + * individually) into a single list of offset-length pairs. 
+ */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + srt_off_len->num); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz * srt_off_len->num); + srt_off_len->len = (int*) (srt_off_len->off + srt_off_len->num); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + heap_merge(others_req, recv_count, srt_off_len->off, srt_off_len->len, + start_pos, nprocs, nprocs_recv, &srt_off_len->num); + + /* Now, (srt_off_len->off and srt_off_len->len) are in an increasing + * order of file offsets. In addition, they are coalesced. + */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[5] += MPI_Wtime() - curT; +#endif + /* whether or not there are holes */ + hole = (srt_off_len->num > 1); + } + +// printf("%s at %d: ds_write=%s build_srt_off_len=%d hole=%d skip_read=%d srt_off_len->num=%lld\n",__func__,__LINE__, (fd->hints->ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,fd->skip_read,srt_off_len->num); +// printf("%s at %d: ds_write=%s build_srt_off_len=%d hole=%d nprocs_recv=%d(PNCIO_DS_WR_NAGGRS_LB=%d) numx=%lld(PNCIO_DS_WR_NPAIRS_LB=%d)\n",__func__,__LINE__, (fd->hints->ds_write == PNCIO_HINT_ENABLE)?"ENABLE": (fd->hints->ds_write == PNCIO_HINT_DISABLE)?"DISABLE":"AUTO", build_srt_off_len,hole,nprocs_recv,PNCIO_DS_WR_NAGGRS_LB,numx,PNCIO_DS_WR_NPAIRS_LB); + + /* data sieving */ + if (fd->hints->ds_write != PNCIO_HINT_DISABLE && hole) { + if (fd->skip_read) + memset(write_buf, 0, range_size); + else { + MPI_Offset r_len; + r_len = PNCIO_ReadContig(fd, write_buf, range_size, range_off); + if (r_len < 0) return (int)r_len; + } + + /* Once read, holes have been filled and thus the number of + * offset-length pairs, srt_off_len->num, becomes one. + */ + srt_off_len->num = 1; + if (srt_off_len->off == NULL) { /* if has not been malloc-ed yet */ +#ifdef HAVE_MPI_LARGE_COUNT + alloc_sz = sizeof(MPI_Offset) + sizeof(MPI_Count); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (MPI_Count*) (srt_off_len->off + 1); +#else + alloc_sz = sizeof(MPI_Offset) + sizeof(int); + srt_off_len->off = (MPI_Offset*) NCI_Malloc(alloc_sz); + srt_off_len->len = (int*) (srt_off_len->off + 1); +#endif + } + srt_off_len->off[0] = range_off; + srt_off_len->len[0] = range_size; + } + + /* It is possible sum_recv (sum of message sizes to be received) is larger + * than the size of collective buffer, write_buf, if writes from multiple + * remote processes overlap. Receiving messages into overlapped regions of + * the same write_buffer may cause a problem. To avoid it, we allocate a + * temporary buffer big enough to receive all messages into disjointed + * regions. Earlier in LUSTRE_Exch_and_write(), write_buf is already + * allocated with twice amount of the file stripe size, with the second + * half to be used to receive messages. If sum_recv is smaller than file + * stripe size, we can reuse that space. But if sum_recv is bigger (an + * overlap case, which is rare), we allocate a separate buffer of size + * sum_recv. 
+ */ + sum_recv -= recv_size[myrank]; + if (sum_recv > fd->hints->striping_unit) + *recv_buf = (char *) NCI_Realloc(*recv_buf, sum_recv); + contig_buf = *recv_buf; + + /* cache displacement-length pairs of receive buffer */ + buf_ptr = contig_buf; + for (i = 0; i < nprocs; i++) { + if (recv_size[i] == 0) + continue; + if (i != myrank) { + if (recv_count[i] > 1) { + CACHE_REQ(recv_list[i], recv_size[i], buf_ptr) + buf_ptr += recv_size[i]; + } else { + /* recv_count[i] is the number of noncontiguous offset-length + * pairs describing the write requests of rank i that fall + * into this aggregator's file domain. When recv_count[i] is 1, + * there is only one such pair, meaning the receive message is + * to be stored contiguously. Such message can be received + * directly into write_buf. + */ + CACHE_REQ(recv_list[i], recv_size[i], + write_buf + others_req[i].mem_ptrs[start_pos[i]]) + } + } else if (buf_view->is_contig && recv_count[i] > 0) { + /* send/recv to/from self uses memcpy(). The case when buftype is + * not contiguous will be handled later in Exchange_data_send(). + */ + char *fromBuf = (char *) buf + buf_idx[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(i, fromBuf, start_pos[i], recv_count[i], write_buf); + } + } + return NC_NOERR; +} + +static +void Exchange_data_send( + PNCIO_File *fd, + const void *buf, /* user buffer */ + char *write_buf, /* OUT: internal buffer used to write + * to file, only matter when send to + * self */ + char **send_buf_ptr, /* OUT: [cb_nodes] point to internal + * send buffer */ + PNCIO_View *buf_view, /* IN/OUT: flattened buffer + * offset-length pairs */ + const MPI_Count *send_size, /* [cb_nodes] send_size[i] is amount of + * this rank sent to aggregator i */ + MPI_Count self_count, /* No. offset-length pairs sent to self + * rank */ + MPI_Count start_pos, /* others_req[myrank].curr */ + const PNCIO_Access *others_req, /* [nprocs] only used when send to self, + * others_req[myrank] */ + const MPI_Offset *buf_idx, /* [cb_nodes] indices to user buffer + * for sending this rank's write data + * to aggregator i */ + disp_len_list *send_list) /* OUT: displacement-length pairs of + * send buffer */ +{ + int i, myrank, cb_nodes; + + *send_buf_ptr = NULL; + + MPI_Comm_rank(fd->comm, &myrank); + + cb_nodes = fd->hints->cb_nodes; +// if (myrank==0) printf("%s at %d: cb_nodes=%d\n",__func__,__LINE__, cb_nodes); + if (buf_view->is_contig) { + /* If buftype is contiguous, data can be directly sent from user buf + * at location given by buf_idx. + */ + for (i = 0; i < cb_nodes; i++) { +// if (myrank==0 && send_size[i]) printf("%s at %d: cb_nodes=%d send_size[%d]=%lld my_cb_nodes_index=%d\n",__func__,__LINE__, cb_nodes,i,send_size[i],fd->my_cb_nodes_index); + if (send_size[i] && i != fd->my_cb_nodes_index) + CACHE_REQ(send_list[i], send_size[i], (char*)buf + buf_idx[i]); + } + } else { + char **send_buf, *self_buf; + + /* total send size of this round */ + size_t send_total_size = 0; + for (i = 0; i < cb_nodes; i++) + send_total_size += send_size[i]; + + if (send_total_size == 0) return; + + /* The user buffer to be used to send in this round is not contiguous, + * allocate send_buf[], a contiguous space, copy data to send_buf, + * including ones to be sent to self, and then use send_buf to send. 
+ */ + send_buf = (char **) NCI_Malloc(cb_nodes * sizeof(char *)); + send_buf[0] = (char *) NCI_Malloc(send_total_size); + for (i = 1; i < cb_nodes; i++) + send_buf[i] = send_buf[i - 1] + send_size[i - 1]; + + LUSTRE_Fill_send_buffer(fd, buf, buf_view, send_buf, + send_total_size, send_size, &self_buf, + send_list); + /* Send buffers must not be touched before MPI_Waitall() is completed, + * and thus send_buf will be freed in LUSTRE_Exch_and_write() + */ + + if (fd->my_cb_nodes_index >= 0 && send_size[fd->my_cb_nodes_index] > 0) { + /* contents of user buf that must be sent to self has been copied + * into send_buf[fd->my_cb_nodes_index]. Now unpack it into + * write_buf. + */ + if (self_buf == NULL) self_buf = send_buf[fd->my_cb_nodes_index]; + MEMCPY_UNPACK(myrank, self_buf, start_pos, self_count, write_buf); + } + + *send_buf_ptr = send_buf[0]; + NCI_Free(send_buf); + } +} + +static void LUSTRE_Fill_send_buffer(PNCIO_File *fd, + const void *buf, + PNCIO_View *buf_view, /* IN/OUT */ + char **send_buf, + size_t send_total_size, + const MPI_Count *send_size, + char **self_buf, + disp_len_list *send_list) +{ + /* this function is only called if buftype is not contiguous */ + int q, first_q=-1, isUserBuf=0; + MPI_Count send_size_rem=0, size, copy_size=0; + char *user_buf_ptr=NULL, *send_buf_ptr=NULL, *same_buf_ptr=NULL; + MPI_Offset off, user_buf_idx; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset len, rem_len; +#else + int len, rem_len; +#endif + +#ifdef WKL_DEBUG +int num_memcpy=0; +#endif + + *self_buf = NULL; + + /* user_buf_idx is to the index offset to buf, indicating the starting + * location to be copied. + * + * buf_view stores the offset-length pairs of the flattened user buffer + * data type. Note this stores offset-length pairs of the data type, + * and write amount can be a multiple of the data type. + * buf_view.count: the number of pairs + * buf_view.off[i]: the ith pair's byte offset to buf. Note the + * flattened offsets of user buffer type may not be sorted in an + * increasing order, unlike fileview which is required by MPI to be + * sorted in a monotonically non-decreasing order. + * buf_view.len[i]: length of the ith pair + * buf_view.idx: index to the offset-length pair currently being + * processed, incremented each round. + * buf_view.rem: amount of data in the pair that has not been copied + * over, changed each round. + */ + user_buf_idx = buf_view->off[buf_view->idx] + + buf_view->len[buf_view->idx] + - buf_view->rem; + /* in case data left to be copied from previous round */ + + /* fd->flat_file.count: the number of noncontiguous file segments this + * rank writes to. Each segment i is described by fd->flat_file.offs[i] + * and fd->flat_file.len[i]. + * fd->flat_file.idx: the index to the fd->flat_file.offs[], + * fd->flat_file.len[] that have been processed in the previous round. 
+ * The while loop below packs write data into send buffers, send_buf[], + * based on this rank's off-len pairs in its file view, + */ + off = fd->flat_file.off[fd->flat_file.idx] + + fd->flat_file.len[fd->flat_file.idx] + - fd->flat_file.rem; + rem_len = fd->flat_file.rem; + +// int rank; MPI_Comm_rank(MPI_COMM_WORLD, &rank); + while (send_total_size > 0) { + /* this off-len request may span to more than one I/O aggregator */ +// if (rank == 0) printf("rank 0 %s at %d send_total_size=%zd rem_len=%lld\n",__func__,__LINE__,send_total_size,rem_len); + while (rem_len != 0) { + len = rem_len; + q = LUSTRE_Calc_aggregator(fd, off, &len); + /* NOTE: len will be modified by PNCIO_Calc_aggregator() to be no + * more than a file stripe unit size that aggregator "q" is + * responsible for. Note q is not the MPI rank ID, It is the array + * index to fd->hints->ranklist[]. + * + * Now len is the amount of data in ith off-len pair that should be + * sent to aggregator q. Note q can also be self. In this case, + * data is also packed into send_buf[q] or pointed to a segment of + * buf when the data to be packed is contiguous. send_buf[q] will + * later be copied to write buffer in MEMCPY_UNPACK, instead of + * calling MPI_Issend to send. + * + * send_size[q]: data amount of this rank needs to send to + * aggregator q in this round. + * + * len and send_size[q] are all always <= striping_unit + */ + +// if (rank == 0) printf("rank 0 %s at %d rem_len=%lld len=%lld first_q=%d q=%d idx=%lld\n",__func__,__LINE__,rem_len,len,first_q,q,buf_view->idx); + + if (first_q != q) { + assert(send_size_rem == 0); + first_q = q; + isUserBuf = 1; + send_size_rem = send_size[q]; + copy_size = 0; + same_buf_ptr = (char*)buf + user_buf_idx; /* no increment */ + user_buf_ptr = same_buf_ptr; /* increment after each memcpy */ + if (send_buf != NULL) + send_buf_ptr = send_buf[q]; /* increment after each memcpy */ + } + + /* copy len amount of data from buf to send_buf[q] */ + size = len; + + while (size) { + MPI_Count size_in_buf = MIN(size, buf_view->rem); + copy_size += size_in_buf; + user_buf_idx += size_in_buf; + send_size_rem -= size_in_buf; + buf_view->rem -= size_in_buf; +// if (rank == 0) printf("rank 0 %s at %d size=%lld size_in_buf=%lld copy_size=%lld rem=%ld\n",__func__,__LINE__, size, size_in_buf, copy_size,buf_view->rem); + if (buf_view->rem == 0) { /* move on to next off-len pair */ + if (! 
buf_view->is_contig) {
+                        /* user buffer type is not contiguous */
+                        if (send_size_rem) {
+                            /* after this copy send_buf[q] is still not full */
+                            isUserBuf = 0;
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 1 size=%lld user_buf_ptr=%p\n",copy_size,user_buf_ptr);
+                            memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+                            user_buf_ptr += copy_size;
+                            send_buf_ptr += copy_size;
+                            copy_size = 0;
+                        } else if (isUserBuf == 0) {
+                            /* send_buf[q] is full and not using user buf,
+                             * copy the remaining delayed data */
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 2 size=%lld\n",copy_size);
+                            memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+                            user_buf_ptr += copy_size;
+                        }
+#ifdef WKL_DEBUG
+                        num_memcpy++;
+#endif
+                    }
+                    /* update buf_view->idx, buf_view->rem, and
+                     * user_buf_idx
+                     */
+                    buf_view->idx++;
+                    assert(buf_view->idx <= buf_view->count);
+
+                    if (buf_view->idx < buf_view->count) {
+                        user_buf_idx = buf_view->off[buf_view->idx];
+                        buf_view->rem = buf_view->len[buf_view->idx];
+                        user_buf_ptr = (char*) buf + user_buf_idx;
+                    }
+                    else assert(size - size_in_buf == 0);
+
+                }
+                else if (send_size_rem == 0 && isUserBuf == 0) {
+                    /* buf_view->rem > 0, send_buf[q] is full, and not using
+                     * user buf to send, copy the remaining delayed data
+                     */
+// if (rank == 0 && (char*)buf == (char*)user_buf_ptr) printf("rank 0 copy original buf 3 size=%lld\n",copy_size);
+                    memcpy(send_buf_ptr, user_buf_ptr, copy_size);
+#ifdef WKL_DEBUG
+                    num_memcpy++;
+#endif
+                    user_buf_ptr += copy_size;
+                }
+                size -= size_in_buf;
+            }
+
+            if (send_size_rem == 0) { /* data to q is fully packed */
+                first_q = -1;
+
+                if (q != fd->my_cb_nodes_index) { /* send only if not self rank */
+                    if (isUserBuf)
+                        CACHE_REQ(send_list[q], send_size[q], same_buf_ptr)
+                    else
+                        CACHE_REQ(send_list[q], send_size[q], send_buf[q])
+                }
+                else if (isUserBuf) {
+                    /* The send buffer is also (part of) the user's buf.
+                     * Return the buffer pointer, so the self send data can
+                     * be directly unpacked from the user buf to the write
+                     * buffer.
+                     */
+                    *self_buf = same_buf_ptr;
+                }
+            }
+            /* len is the amount of data copied */
+            off += len;
+            rem_len -= len;
+            fd->flat_file.rem -= len;
+            send_total_size -= len;
+            if (send_total_size == 0) break;
+        }
+
+        /* all requested data has been packed; stop before advancing past
+         * the last offset-length pair of the file view
+         */
+        if (send_total_size == 0) break;
+
+        /* done with this off-len pair, move on to the next */
+        if (fd->flat_file.rem == 0) {
+            fd->flat_file.idx++;
+            fd->flat_file.rem = fd->flat_file.len[fd->flat_file.idx];
+        }
+        off = fd->flat_file.off[fd->flat_file.idx];
+        rem_len = fd->flat_file.rem;
+    }
+
+#ifdef WKL_DEBUG
+    if (num_memcpy > 0) printf("---- fd->flat_file.count=%lld fd->flat_file.idx=%lld buf_view->count=%lld num_memcpy=%d\n",fd->flat_file.count,fd->flat_file.idx,buf_view->count,num_memcpy);
+#endif
+}
+
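+/* The packing loop in LUSTRE_Fill_send_buffer() above interleaves two
+ * flattened views: the file view (fd->flat_file) and the user buffer view
+ * (buf_view). A minimal standalone sketch of the same two-pointer walk is
+ * shown below. The types and names are hypothetical, not part of this
+ * driver; it assumes the driver's MIN macro. It packs nbytes from a
+ * noncontiguous user buffer, described by offset-length pairs bv[], into a
+ * contiguous staging buffer dst, the way send_buf[q] is filled above.
+ */
+#if 0 /* illustration only, never compiled */
+typedef struct { MPI_Offset off, len; } flat_pair;
+
+static void pack_flattened(const flat_pair *bv, MPI_Count npairs,
+                           const char *src, char *dst, MPI_Offset nbytes)
+{
+    MPI_Offset j = 0, rem = bv[0].len, done = 0;
+    while (done < nbytes && j < npairs) {
+        MPI_Offset sz = MIN(rem, nbytes - done);
+        /* copy from the unconsumed region of the current pair */
+        memcpy(dst + done, src + bv[j].off + bv[j].len - rem, sz);
+        done += sz;
+        rem  -= sz;
+        if (rem == 0 && ++j < npairs) /* advance to the next pair */
+            rem = bv[j].len;
+    }
+}
+#endif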
diff --git a/src/drivers/pncio/pncio_lustre_wrstr.c b/src/drivers/pncio/pncio_lustre_wrstr.c
new file mode 100644
index 000000000..341fab400
--- /dev/null
+++ b/src/drivers/pncio/pncio_lustre_wrstr.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <pncio.h>
+
+#define BUFFERED_WRITE {                                                      \
+    if (req_off >= writebuf_off + writebuf_len) {                            \
+        if (writebuf_len) {                                                  \
+            w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,            \
+                                      writebuf_off);                         \
+            if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+                PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+            if (w_len < 0) {                                                 \
+                NCI_Free(writebuf);                                          \
+                return w_len;                                                \
+            }                                                                \
+            total_w_len += w_len;                                            \
+        }                                                                    \
+        writebuf_off = req_off;                                              \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+        r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);  \
+        if (r_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return r_len;                                                    \
+        }                                                                    \
+    }                                                                        \
+    write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);          \
+    memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off,     \
+           write_sz);                                                        \
+    while (write_sz != req_len) {                                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);          \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        req_len -= write_sz;                                                 \
+        userbuf_off += write_sz;                                             \
+        writebuf_off += writebuf_len;                                        \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)     \
+            PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len);      \
+        r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);  \
+        if (r_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return r_len;                                                    \
+        }                                                                    \
+        write_sz = MIN(req_len, writebuf_len);                               \
+        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);               \
+    }                                                                        \
+}
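+
+/* BUFFERED_WRITE above implements the write side of data sieving: it keeps
+ * a stripe-aligned window of the file in writebuf and, before modifying it,
+ * fills the window from the file so that bytes not covered by the request
+ * are preserved. A minimal sketch of one read-modify-write cycle is shown
+ * below; it reuses the variable names of the macro but is an illustration
+ * only, not part of this driver.
+ */
+#if 0 /* illustration only, never compiled */
+    /* align the buffered window to the next stripe boundary */
+    writebuf_off = req_off;
+    writebuf_len = MIN(end_offset - writebuf_off + 1,
+                       (writebuf_off / stripe_size + 1) * stripe_size
+                       - writebuf_off);
+    /* read:   fetch the current file contents of the window */
+    r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off);
+    /* modify: overwrite only the requested byte range */
+    memcpy(writebuf + (req_off - writebuf_off), (char *)buf + userbuf_off,
+           write_sz);
+    /* write:  flush the whole window back in one contiguous write */
+    w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+#endif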
+
+/* This macro is used when the filetype is contiguous and the buftype is
+ * not. It does not do a read-modify-write and does not lock.
+ */
+#define BUFFERED_WRITE_WITHOUT_READ {                                        \
+    if (req_off >= writebuf_off + writebuf_len) {                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        writebuf_off = req_off;                                              \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+    }                                                                        \
+    write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);          \
+    memcpy(writebuf + req_off - writebuf_off,                                \
+           (char *)buf + userbuf_off, write_sz);                             \
+    while (write_sz != req_len) {                                            \
+        w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+        if (w_len < 0) {                                                     \
+            NCI_Free(writebuf);                                              \
+            return w_len;                                                    \
+        }                                                                    \
+        total_w_len += w_len;                                                \
+        req_len -= write_sz;                                                 \
+        userbuf_off += write_sz;                                             \
+        writebuf_off += writebuf_len;                                        \
+        /* stripe_size alignment */                                          \
+        writebuf_len = MIN(end_offset - writebuf_off + 1,                    \
+                           (writebuf_off / stripe_size + 1) * stripe_size    \
+                           - writebuf_off);                                  \
+        write_sz = MIN(req_len, writebuf_len);                               \
+        memcpy(writebuf, (char *)buf + userbuf_off, write_sz);               \
+    }                                                                        \
+}
+
+MPI_Offset PNCIO_LUSTRE_WriteStrided(PNCIO_File *fd,
+                                     const void *buf,
+                                     PNCIO_View buf_view,
+                                     MPI_Offset offset)
+{
+    char *writebuf;
+    int i, j, k, st_index=0, stripe_size;
+    /* offset is in units of etype relative to the filetype. */
+    MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0, off, disp;
+    MPI_Offset userbuf_off, req_off, end_offset=0, writebuf_off, start_off;
+    MPI_Offset new_bwr_size, new_fwr_size, st_fwr_size, fwr_size=0, bwr_size;
+    MPI_Offset req_len, r_len, w_len, total_w_len=0;
+    MPI_Count bufsize, writebuf_len, write_sz;
+
+    /* The case of both buftype and filetype being contiguous has gone to
+     * PNCIO_WriteContig().
+     */
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    if (fd->hints->ds_write == PNCIO_HINT_DISABLE) {
+        /* if the user has disabled data sieving on writes, use the naive
+         * approach instead.
+         */
+        return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset);
+    }
+
+    /* PnetCDF always sets these 3 conditions */
+    assert(fd->filetype == MPI_BYTE);
+    assert(fd->flat_file.size == buf_view.size);
+    if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+    bufsize = buf_view.size;
+
+    /* get striping info */
+    stripe_size = fd->hints->striping_unit;
+
+    if (!buf_view.is_contig && fd->flat_file.is_contig) {
+        /* noncontiguous in write buffer, contiguous in file.
*/ + + off = fd->disp + offset; + if (fd->flat_file.count > 0) off += fd->flat_file.off[0]; + + start_off = off; + end_offset = start_off + bufsize - 1; + + /* write stripe size buffer each time */ + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed + */ + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + BUFFERED_WRITE_WITHOUT_READ; + off += buf_view.len[i]; + } + + /* write the buffer out the last round */ + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, bufsize); + + NCI_Free(writebuf); + + if (w_len < 0) return w_len; + total_w_len += w_len; + + } else { /* contiguous buffer and non-contiguous in file */ + disp = fd->disp; +/* for non-contiguous in file, PnetCDF always uses disp == 0 */ +assert(disp == 0); + + /* find the starting index in fd->flat_file offset-length pairs */ + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > offset) { + st_index = i; + fwr_size = sum - offset; + abs_off_in_filetype = fd->flat_file.off[i] + + offset - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; + + /* Write request is within single flat_file contig block. This could + * happen, for example, with subarray types that are actually fairly + * contiguous. + */ + if (buf_view.is_contig && bufsize <= fwr_size) { + req_off = start_off; + req_len = bufsize; + end_offset = start_off + bufsize - 1; + writebuf = (char *) NCI_Malloc(MIN(bufsize, stripe_size)); + memset(writebuf, -1, (size_t)MIN(bufsize, stripe_size)); + writebuf_off = 0; + writebuf_len = 0; + userbuf_off = 0; + BUFFERED_WRITE_WITHOUT_READ; + + /* write the buffer out the last round */ + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (w_len > 0) total_w_len += w_len; + + if (fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + + NCI_Free(writebuf); + + return total_w_len; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. 
+ * e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ + + st_fwr_size = fwr_size; + j = st_index; + i_offset = fwr_size = MIN(st_fwr_size, bufsize); + end_offset = offset + fwr_size - 1; + while (i_offset < bufsize) { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + i_offset += fwr_size; + end_offset = off + fwr_size - 1; + } + + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + writebuf_off = 0; + writebuf_len = 0; + writebuf = (char *) NCI_Malloc(stripe_size); + memset(writebuf, -1, stripe_size); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. + */ + i_offset = 0; + j = st_index; + off = offset; + fwr_size = MIN(st_fwr_size, bufsize); + while (i_offset < bufsize) { + if (fwr_size) { + req_off = off; + req_len = fwr_size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + i_offset += fwr_size; + if (i_offset >= bufsize) break; + + if (off + fwr_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += fwr_size; + /* no more I/O needed. off is incremented by fwr_size. */ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + fwr_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + fwr_size = st_fwr_size; + bwr_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(fwr_size, bwr_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_WRITE; + } + num += size; + if (num >= bufsize) break; + + new_fwr_size = fwr_size; + new_bwr_size = bwr_size; + + if (size == fwr_size) { + /* reached end of contiguous block in file */ + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + + new_fwr_size = fd->flat_file.len[j]; + if (size != bwr_size) { + i_offset += size; + new_bwr_size -= size; + } + } + + if (size == bwr_size) { + /* reached end of contiguous block in memory */ + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_bwr_size = buf_view.len[k]; + if (size != fwr_size) { + off += size; + new_fwr_size -= size; + } + } + fwr_size = new_fwr_size; + bwr_size = new_bwr_size; + } + } + + /* write the buffer out the last round */ + if (writebuf_len) { + w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); + if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); + if (w_len < 0) return w_len; + total_w_len += w_len; + } + if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(writebuf); + } + + return buf_view.size; +} diff --git a/src/drivers/pncio/pncio_open.c b/src/drivers/pncio/pncio_open.c new file mode 100644 index 000000000..38981b2c8 --- /dev/null +++ b/src/drivers/pncio/pncio_open.c @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>     /* fprintf() */
+#include <string.h>    /* strerror() */
+#include <fcntl.h>     /* open(), O_CREAT */
+#include <sys/types.h> /* open(), umask() */
+#include <sys/stat.h>  /* umask() */
+
+#include <errno.h>     /* errno */
+#include <assert.h>    /* assert() */
+
+#include <mpi.h>
+
+#include "pncio.h"
+
+/*----< GEN_set_cb_node_list() >---------------------------------------------*/
+/* Construct the list of I/O aggregators. It sets the following:
+ * fd->hints->ranklist[]
+ * fd->hints->cb_nodes, and the file info for hint cb_nodes
+ * fd->is_agg: indicating whether this rank is an I/O aggregator
+ * fd->my_cb_nodes_index: index into fd->hints->ranklist[], -1 if N/A
+ */
+static
+int GEN_set_cb_node_list(PNCIO_File *fd)
+{
+    int i, j, k, nprocs, rank, *nprocs_per_node, **ranks_per_node;
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (fd->hints->cb_nodes == 0)
+        /* If hint cb_nodes is not set by the user, select one rank per node
+         * to be an I/O aggregator
+         */
+        fd->hints->cb_nodes = fd->num_nodes;
+    else if (fd->hints->cb_nodes > nprocs)
+        /* cb_nodes must be <= nprocs */
+        fd->hints->cb_nodes = nprocs;
+
+    fd->hints->ranklist = (int *) NCI_Malloc(sizeof(int) * fd->hints->cb_nodes);
+    if (fd->hints->ranklist == NULL)
+        return NC_ENOMEM;
+
+    /* number of MPI processes running on each node */
+    nprocs_per_node = (int *) NCI_Calloc(fd->num_nodes, sizeof(int));
+
+    for (i=0; i<nprocs; i++)
+        nprocs_per_node[fd->node_ids[i]]++;
+
+    /* construct rank IDs of MPI processes running on each node */
+    ranks_per_node = (int **) NCI_Malloc(sizeof(int*) * fd->num_nodes);
+    ranks_per_node[0] = (int *) NCI_Malloc(sizeof(int) * nprocs);
+    for (i=1; i<fd->num_nodes; i++)
+        ranks_per_node[i] = ranks_per_node[i - 1] + nprocs_per_node[i - 1];
+
+    for (i=0; i<fd->num_nodes; i++) nprocs_per_node[i] = 0;
+
+    /* Populate ranks_per_node[], the list of MPI ranks running on each node.
+     * Populate nprocs_per_node[], the number of MPI processes on each node.
+     */
+    for (i=0; i<nprocs; i++) {
+        k = fd->node_ids[i];
+        ranks_per_node[k][nprocs_per_node[k]] = i;
+        nprocs_per_node[k]++;
+    }
+
+    /* select process ranks from nodes in a round-robin fashion to be I/O
+     * aggregators
+     */
+    k = j = 0;
+    for (i=0; i<fd->hints->cb_nodes; i++) {
+        if (j >= nprocs_per_node[k]) { /* if run out of ranks in this node k */
+            k++;
+            if (k == fd->num_nodes) { /* round-robin back to the first node */
+                k = 0;
+                j++;
+            }
+        }
+        /* select the jth rank of node k as an I/O aggregator */
+        fd->hints->ranklist[i] = ranks_per_node[k++][j];
+        if (rank == fd->hints->ranklist[i]) {
+            fd->is_agg = 1;
+            fd->my_cb_nodes_index = i;
+        }
+        if (k == fd->num_nodes) { /* round-robin back to the first node */
+            k = 0;
+            j++;
+        }
+    }
+    NCI_Free(ranks_per_node[0]);
+    NCI_Free(ranks_per_node);
+    NCI_Free(nprocs_per_node);
+
+    return 0;
+}
+
+/*----< GEN_create() >-------------------------------------------------------*/
+/* 1. root creates the file
+ * 2. root sets and obtains striping info
+ * 3. root broadcasts striping info
+ * 4. non-root processes receive striping info from root
+ * 5. non-root processes open the file
+ */
+static int
+GEN_create(PNCIO_File *fd,
+           int mpi_io_mode)
+{
+    int err=NC_NOERR, rank, amode, perm, old_mask;
+    int stripin_info[4] = {-1, -1, -1, -1};
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+if (rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);}
+#endif
+
+    amode = O_CREAT;
+    if (mpi_io_mode & MPI_MODE_RDWR) amode |= O_RDWR;
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    /* The root process creates the file first; all other processes then
+     * open it.
+     */
+    if (rank > 0) goto err_out;
+
+    fd->fd_sys = open(fd->filename, amode, perm);
+    if (fd->fd_sys == -1) {
+        fprintf(stderr,"%s line %d: rank %d fails to create file %s (%s)\n",
+                __func__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+
+err_out:
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+
+    if (rank > 0) { /* non-root processes */
+        fd->fd_sys = open(fd->filename, O_RDWR, perm);
+        if (fd->fd_sys == -1) {
+            fprintf(stderr,"%s line %d: rank %d fails to open file %s (%s)\n",
+                    __func__,__LINE__, rank, fd->filename, strerror(errno));
+            return ncmpii_error_posix2nc("open");
+        }
+    }
+
+    /* construct the cb_nodes rank list */
+    GEN_set_cb_node_list(fd);
+    MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:");
+
+    return err;
+}
+
+/*----< GEN_open() >---------------------------------------------------------*/
+/* 1. all processes open the file.
+ * 2. root obtains striping info and broadcasts it to all others
+ */
+static int
+GEN_open(PNCIO_File *fd)
+{
+    int err=NC_NOERR, rank, perm, old_mask, omode;
+    int stripin_info[4] = {1048576, -1, -1, -1};
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+if (rank == 0) { printf("\nxxxx %s at %d: ---- %s\n",__func__,__LINE__,fd->filename); fflush(stdout);}
+#endif
+
+    old_mask = umask(022);
+    umask(old_mask);
+    perm = old_mask ^ PNCIO_PERM;
+
+    if (fIsSet(fd->access_mode, MPI_MODE_RDWR))
+        omode = O_RDWR;
+    else
+        omode = O_RDONLY;
+
+    /* All processes open the file. */
+    fd->fd_sys = open(fd->filename, omode, perm);
+    if (fd->fd_sys == -1) {
+        fprintf(stderr, "%s line %d: rank %d fails to open file %s (%s)\n",
+                __func__,__LINE__, rank, fd->filename, strerror(errno));
+        err = ncmpii_error_posix2nc("open");
+        goto err_out;
+    }
+
+    /* Only root obtains the striping information and broadcasts it to all
+     * other processes.
+     */
+    if (rank == 0) {
+        /* Use the underlying file system block size as the file
+         * striping_unit */
+        struct stat statbuf;
+        if (fstat(fd->fd_sys, &statbuf) == 0)
+            /* file system block size is usually < MAX_INT */
+            stripin_info[0] = (int)statbuf.st_blksize;
+    }
+
+err_out:
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    fd->hints->striping_unit   = stripin_info[0];
+    fd->hints->striping_factor = stripin_info[1];
+    fd->hints->start_iodevice  = stripin_info[2];
+
+    /* construct the cb_nodes rank list */
+    GEN_set_cb_node_list(fd);
+    MPI_Info_set(fd->info, "romio_filesystem_type", "UFS:");
+
+    return err;
+}
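+
+/* Both GEN_create and GEN_open follow the same pattern: only rank 0 queries
+ * the file system for striping parameters, then broadcasts them so that all
+ * ranks agree without flooding the file system with metadata requests. A
+ * minimal sketch of the pattern, reusing the names above (illustration
+ * only, not part of this driver):
+ */
+#if 0 /* illustration only, never compiled */
+    int stripin_info[4] = {-1, -1, -1, -1};
+    if (rank == 0) { /* only root touches the file system */
+        struct stat statbuf;
+        if (fstat(fd->fd_sys, &statbuf) == 0)
+            stripin_info[0] = (int) statbuf.st_blksize;
+    }
+    /* every rank, including root, now picks up the same values */
+    MPI_Bcast(stripin_info, 4, MPI_INT, 0, fd->comm);
+    fd->hints->striping_unit = stripin_info[0];
+#endif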
+
+/*----< PNCIO_File_open() >---------------------------------------------------*/
+int PNCIO_File_open(MPI_Comm comm,
+                    const char *filename,
+                    int amode,
+                    MPI_Info info,
+                    PNCIO_File *fd)
+{
+    /* Before reaching this subroutine, PNCIO_FileSysType() should have been
+     * called to check the file system type.
+     */
+    char value[MPI_MAX_INFO_VAL + 1], int_str[16];
+    int i, err, min_err;
+
+    fd->comm        = comm;
+    fd->filename    = filename; /* without file system type name prefix */
+    fd->atomicity   = 0;
+    fd->filetype    = MPI_BYTE;
+    fd->is_open     = 0;
+    fd->access_mode = amode;
+    fd->io_buf      = NULL; /* collective buffer used by aggregators only */
+
+    fd->flat_file.count     = 0; /* flattened fileview in offset-length pairs */
+    fd->flat_file.size      = -1;
+    fd->flat_file.is_contig = 1;
+    fd->flat_file.off       = NULL;
+    fd->flat_file.len       = NULL;
+
+    /* create and initialize the info object */
+    fd->hints = (PNCIO_Hints*) NCI_Calloc(1, sizeof(PNCIO_Hints));
+    if (info == MPI_INFO_NULL)
+        MPI_Info_create(&fd->info);
+    else
+        MPI_Info_dup(info, &fd->info);
+
+    err = PNCIO_File_SetInfo(fd, fd->info);
+    if (err != NC_NOERR)
+        return err;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    for (i=0; iwrite_timing[i] = fd->read_timing[i] = 0;
+        fd->write_counter[i] = fd->read_counter[i] = 0;
+    }
+#endif
+
+    assert(fd->file_system != PNCIO_FSTYPE_MPIIO);
+
+    if (fd->file_system == PNCIO_LUSTRE) {
+        if (amode & MPI_MODE_CREATE)
+            err = PNCIO_Lustre_create(fd, amode);
+        else
+            err = PNCIO_Lustre_open(fd);
+    }
+    else {
+        if (amode & MPI_MODE_CREATE)
+            err = GEN_create(fd, amode);
+        else
+            err = GEN_open(fd);
+    }
+    if (err != NC_NOERR) goto err_out;
+
+    /* TODO: when hint no_indep_rw is set to true, only aggregators open the
+     * file */
+    fd->is_open = 1;
+
+    /* set file striping hints */
+    snprintf(int_str, 16, "%d", fd->hints->striping_unit);
+    MPI_Info_set(fd->info, "striping_unit", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->striping_factor);
+    MPI_Info_set(fd->info, "striping_factor", int_str);
+
+    snprintf(int_str, 16, "%d", fd->hints->start_iodevice);
+    MPI_Info_set(fd->info, "start_iodevice", int_str);
+
+    /* set the number of I/O aggregators */
+    snprintf(int_str, 16, "%d", fd->hints->cb_nodes);
+    MPI_Info_set(fd->info, "cb_nodes", int_str);
+
+    /* add hint "cb_node_list", the list of aggregators' rank IDs */
+    snprintf(value, 16, "%d", fd->hints->ranklist[0]);
+    for (i=1; i<fd->hints->cb_nodes; i++) {
+        snprintf(int_str, 16, " %d", fd->hints->ranklist[i]);
+        if (strlen(value) + strlen(int_str) >= MPI_MAX_INFO_VAL-5) {
+            strcat(value, " ...");
+            break;
+        }
+        strcat(value, int_str);
+    }
+    MPI_Info_set(fd->info, "cb_node_list", value);
+
+    /* the collective buffer size must be at least the file striping size */
+    if (fd->hints->cb_buffer_size < fd->hints->striping_unit) {
+        fd->hints->cb_buffer_size = fd->hints->striping_unit;
+        snprintf(int_str, 16, "%d", fd->hints->cb_buffer_size);
+        MPI_Info_set(fd->info, "cb_buffer_size", int_str);
+    }
+
+    /* the collective buffer is used only by I/O aggregators */
+    if (fd->is_agg) {
+        fd->io_buf = NCI_Calloc(1, fd->hints->cb_buffer_size);
+        if (fd->io_buf == NULL)
+            return NC_ENOMEM;
+    }
+
+err_out:
+    MPI_Allreduce(&err, &min_err, 1, MPI_INT, MPI_MIN, comm);
+    /* All NC errors are < 0 */
+    if (min_err < 0) {
+        if (err == 0) /* close the file if it was opened successfully */
+            close(fd->fd_sys);
+        NCI_Free(fd->hints);
+        if (fd->info != MPI_INFO_NULL)
+            MPI_Info_free(&(fd->info));
+        if (fd->io_buf != NULL)
+            NCI_Free(fd->io_buf);
+        /* all ranks must fail together, as the resources above have been
+         * released */
+        return min_err;
+    }
+    return err;
+}
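+
+/* A hypothetical caller sketch (not part of this patch), showing the
+ * open/create entry point with the MPI-IO style access-mode flags assumed
+ * above:
+ */
+#if 0 /* illustration only, never compiled */
+    PNCIO_File fd;
+    int err = PNCIO_File_open(MPI_COMM_WORLD, "testfile.nc",
+                              MPI_MODE_CREATE | MPI_MODE_RDWR,
+                              MPI_INFO_NULL, &fd);
+    if (err != NC_NOERR)
+        /* all ranks receive the same error, as failures are synchronized
+         * by the MPI_Allreduce above */
+        return err;
+#endif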
diff --git a/src/drivers/pncio/pncio_read.c b/src/drivers/pncio/pncio_read.c
new file mode 100644
index 000000000..a7594ed2f
--- /dev/null
+++ b/src/drivers/pncio/pncio_read.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <assert.h>
+#include <unistd.h> /* pread() */
+
+#include <mpi.h>
+
+#include "pncio.h"
+
+/*----< PNCIO_ReadContig() >--------------------------------------------------*/
+MPI_Offset PNCIO_ReadContig(PNCIO_File *fd,
+                            void *buf,
+                            MPI_Offset r_size,
+                            MPI_Offset offset)
+{
+    ssize_t err = 0;
+    size_t r_count;
+    MPI_Offset bytes_xfered = 0;
+    char *p;
+
+// printf("%s at %d: %s pread offset=%lld r_size=%lld\n",__func__,__LINE__,fd->filename,offset,r_size);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    double timing = MPI_Wtime();
+#endif
+    p = (char *) buf;
+    while (bytes_xfered < r_size) {
+        r_count = r_size - bytes_xfered;
+        err = pread(fd->fd_sys, p, r_count, offset + bytes_xfered);
+        if (err == -1)
+            goto ioerr;
+        if (err == 0)
+            break;
+        bytes_xfered += err;
+        p += err;
+    }
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+    fd->read_timing[2] += MPI_Wtime() - timing;
+#endif
+
+ioerr:
+    if (err == -1)
+        bytes_xfered = ncmpii_error_posix2nc("pread");
+
+/*
+if (offset > 0) {unsigned long long wkl[4];
+    memcpy(wkl, buf, sizeof(unsigned long long) * 4);
+    ncmpii_in_swapn(wkl, 4, 8);
+    printf("%s at %d: %s pread offset=%lld r_size=%lld wkl=%llu %lld %lld %lld\n",__func__,__LINE__,fd->filename,offset,r_size,wkl[0],wkl[1],wkl[2],wkl[3]);
+}
+*/
+
+    return bytes_xfered;
+}
+
+/*----< file_read() >--------------------------------------------------------*/
+/* This is an independent call. */
+static
+MPI_Offset file_read(PNCIO_File *fd,
+                     MPI_Offset offset, /* relative to fileview */
+                     void *buf,
+                     PNCIO_View buf_view)
+{
+    MPI_Offset r_len=0;
+
+// printf("%s at %d: offset=%lld buf_view size=%lld\n",__func__,__LINE__, offset,buf_view.size);
+
+    assert(fd->filetype == MPI_BYTE);
+    if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+    if (buf_view.size == 0) /* zero-sized request */
+        return NC_NOERR;
+
+    if (buf_view.is_contig && fd->flat_file.is_contig) {
+        if (fd->flat_file.count > 0) offset += fd->flat_file.off[0];
+        r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset);
+    }
+    else
+        r_len = PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset);
+
+    return r_len;
+}
+
+/*----< PNCIO_File_read_at() >------------------------------------------------*/
+/* This is an independent call.
+ * offset is a position in the file relative to the current view, expressed
+ * as a count of etypes.
+ */
+MPI_Offset PNCIO_File_read_at(PNCIO_File *fh,
+                              MPI_Offset offset,
+                              void *buf,
+                              PNCIO_View buf_view)
+{
+    assert(fh != NULL);
+
+    if (buf_view.size == 0) return NC_NOERR;
+
+    if (buf_view.size < 0) return NC_ENEGATIVECNT;
+
+    /* PnetCDF has only 2 modes: read-only and read-write */
+    // if (fh->access_mode & MPI_MODE_RDONLY) return NC_EPERM;
+
+    return file_read(fh, offset, buf, buf_view);
+}
+
+/*----< PNCIO_File_read_at_all() >--------------------------------------------*/
+/* This is a collective call.
+ * offset is a position in the file relative to the current view, expressed
+ * as a count of etypes.
+ */
+MPI_Offset PNCIO_File_read_at_all(PNCIO_File *fh,
+                                  MPI_Offset offset,
+                                  void *buf,
+                                  PNCIO_View buf_view)
+{
+    int err=NC_NOERR;
+    MPI_Offset r_len;
+
+    assert(fh != NULL);
+
+    if (buf_view.size < 0) err = NC_ENEGATIVECNT;
+
+    /* PnetCDF has only 2 modes: read-only and read-write */
+    // if (fh->access_mode & MPI_MODE_RDONLY && st == NC_NOERR) st = NC_EPERM;
+
+    /* all ranks must participate in the collective read, even those that
+     * have detected an argument error
+     */
+    r_len = PNCIO_GEN_ReadStridedColl(fh, buf, buf_view, offset);
+
+    return (err == NC_NOERR) ? r_len : err;
+}
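+
+/* A hypothetical usage sketch (not part of this patch): reading nbytes into
+ * a contiguous user buffer through the flattened-view interface. The
+ * PNCIO_View field names follow their uses above; a count of 0 with
+ * is_contig set is assumed to denote a contiguous request of size bytes.
+ */
+#if 0 /* illustration only, never compiled */
+    PNCIO_View v;
+    v.count     = 0;      /* no offset-length pairs: contiguous buffer */
+    v.is_contig = 1;
+    v.size      = nbytes; /* request size in bytes */
+    MPI_Offset got = PNCIO_File_read_at(&fd, offset, buf, v);
+    if (got < 0)
+        return (int)got;  /* negative values are NC error codes */
+#endif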
diff --git a/src/drivers/pncio/pncio_read_coll.c b/src/drivers/pncio/pncio_read_coll.c
new file mode 100644
index 000000000..78af29b48
--- /dev/null
+++ b/src/drivers/pncio/pncio_read_coll.c
@@ -0,0 +1,791 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdbool.h> /* type bool */
+
+#include <pncio.h>
+
+/* prototypes of functions used for collective reads only. */
+static
+MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf,
+                         PNCIO_View buf_view, int nprocs,
+                         int myrank, PNCIO_Access *others_req,
+                         MPI_Offset min_st_offset, MPI_Offset fd_size,
+                         MPI_Offset *fd_start, MPI_Offset *fd_end,
+                         MPI_Aint *buf_idx);
+
+static void R_Exchange_data(PNCIO_File *fd, void *buf,
+                            PNCIO_View buf_view,
+                            MPI_Count *send_size, MPI_Count *recv_size,
+                            MPI_Count *count, MPI_Count *start_pos,
+                            MPI_Count *partial_send,
+                            MPI_Count *recd_from_proc, int nprocs,
+                            int myrank,
+                            MPI_Offset min_st_offset,
+                            MPI_Offset fd_size,
+                            MPI_Offset *fd_start, MPI_Offset *fd_end,
+                            PNCIO_Access *others_req,
+                            int iter, MPI_Aint *buf_idx,
+                            MPI_Aint *actual_recved_bytes);
+
+static void Fill_user_buffer(PNCIO_File *fd, void *buf,
+                             PNCIO_View buf_view, char **recv_buf,
+                             MPI_Count *recv_size,
+                             MPI_Count *recd_from_proc, int nprocs,
+                             MPI_Offset min_st_offset,
+                             MPI_Offset fd_size, MPI_Offset *fd_start,
+                             MPI_Offset *fd_end);
+
+MPI_Offset PNCIO_GEN_ReadStridedColl(PNCIO_File *fd,
+                                     void *buf,
+                                     PNCIO_View buf_view,
+                                     MPI_Offset offset)
+{
+/* Uses a generalized version of the extended two-phase method described in
+ * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core
+ * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming,
+ * (5)4:301--317, Winter 1996.
+ * http://www.mcs.anl.gov/home/thakur/ext2ph.ps
+ */
+
+    PNCIO_Access *my_req;
+    /* array of nprocs structures, one for each other process in whose file
+     * domain this process's request lies */
+
+    PNCIO_Access *others_req;
+    /* array of nprocs structures, one for each other process whose request
+     * lies in this process's file domain. */
+
+    int nprocs, nprocs_for_coll, myrank;
+    int interleave_count = 0;
+    MPI_Count *count_my_req_per_proc, count_my_req_procs;
+    MPI_Count *count_others_req_per_proc, count_others_req_procs;
+    MPI_Offset start_offset, end_offset, fd_size, min_st_offset;
+    MPI_Offset *st_offsets = NULL, *fd_start = NULL,
+               *fd_end = NULL, *end_offsets = NULL;
+    MPI_Aint *buf_idx = NULL;
+    MPI_Offset r_len, total_r_len=0;
+
+// printf("%s at %d:\n",__func__,__LINE__);
+
+    MPI_Comm_size(fd->comm, &nprocs);
+    MPI_Comm_rank(fd->comm, &myrank);
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+    /* the number of aggregators, cb_nodes, is stored in the hints */
+    nprocs_for_coll = fd->hints->cb_nodes;
+
+    /* only check for interleaving if cb_read isn't disabled */
+    if (fd->hints->cb_read != PNCIO_HINT_DISABLE) {
+        /* For this process's request, calculate the file start and end
Note: end_offset points to the last byte-offset that will + * be accessed, e.g., if start_offset=0 and 100 bytes to be read, + * end_offset=99 + */ + if (fd->flat_file.size == 0) { + start_offset = 0; + end_offset = -1; + } + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } + + /* each process communicates its start and end offsets to other + * processes. The result is an array each of start and end offsets + * stored in order of process rank. */ + st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + end_offsets = st_offsets + nprocs; + + MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET, + fd->comm); + MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET, + fd->comm); + + /* Are the accesses of different processes interleaved? Below is a + * rudimentary check for interleaving, but should suffice for the + * moment. */ + for (int i = 1; i < nprocs; i++) + if ((st_offsets[i] < end_offsets[i - 1]) && + (st_offsets[i] <= end_offsets[i])) + interleave_count++; + } + + if (fd->hints->cb_read == PNCIO_HINT_DISABLE + || (!interleave_count && (fd->hints->cb_read == PNCIO_HINT_AUTO))) { + /* switch to independent read */ + + if (st_offsets != NULL) NCI_Free(st_offsets); + + if (buf_view.size == 0) return 0; + +/* PnetCDF always sets this condition, i.e. when fileview is non-contiguous, offset in this call is always 0. */ +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + return PNCIO_ReadContig(fd, buf, buf_view.size, offset); + } + else + return PNCIO_GEN_ReadStrided(fd, buf, buf_view, offset); + } + + /* We're going to perform aggregation of I/O. Here we call + * PNCIO_Calc_file_domains() to determine what processes will handle I/O + * to what regions. We pass nprocs_for_coll into this function; it is + * used to determine how many processes will perform I/O, which is also + * the number of regions into which the range of bytes must be divided. + * These regions are called "file domains", or FDs. + * + * When this function returns, fd_start, fd_end, fd_size, and + * min_st_offset will be filled in. fd_start holds the starting byte + * location for each file domain. fd_end holds the ending byte location. + * min_st_offset holds the minimum byte location that will be accessed. + * + * Both fd_start[] and fd_end[] are indexed by an aggregator number; this + * needs to be mapped to an actual rank in the communicator later. + * + */ + PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, + fd->hints->striping_unit); + + /* calculate where the portions of the access requests of this process + * are located in terms of the file domains. this could be on the same + * process or on other processes. this function fills in: + * count_my_req_procs - number of processes (including this one) for which + * this process has requests in their file domain + * count_my_req_per_proc - count of requests for each process, indexed + * by rank of the process + * my_req[] - array of data structures describing the requests to be + * performed by each process (including self). indexed by rank. 
+ * buf_idx[] - array of locations into which data can be directly moved; + * this is only valid for contiguous buffer case + */ + PNCIO_Calc_my_req(fd, min_st_offset, fd_start, fd_end, fd_size, nprocs, + &count_my_req_procs, &count_my_req_per_proc, &my_req, + &buf_idx); + + /* perform a collective communication in order to distribute the + * data calculated above. fills in the following: + * count_others_req_procs - number of processes (including this + * one) which have requests in this process's file domain. + * count_others_req_per_proc[] - number of separate contiguous + * requests from proc i lie in this process's file domain. + */ + PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, + my_req, nprocs, myrank, &count_others_req_procs, + &count_others_req_per_proc, &others_req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[1] += MPI_Wtime() - curT; +#endif + + /* read data in sizes of no more than collective buffer size, + * communicate, and fill user buf. + */ + r_len = Read_and_exch(fd, buf, buf_view, nprocs, myrank, others_req, + min_st_offset, fd_size, fd_start, fd_end, buf_idx); + if (r_len > 0) total_r_len += r_len; + + /* free all memory allocated for collective I/O */ + PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx); + PNCIO_Free_others_req(count_others_req_per_proc, others_req); + + NCI_Free(st_offsets); + NCI_Free(fd_start); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[0] += MPI_Wtime() - curT; +#endif + + return (r_len < 0) ? r_len : total_r_len; +} + +static +MPI_Offset Read_and_exch(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, int nprocs, + int myrank, PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx) +{ +/* Read in sizes of no more than coll_bufsize, an info parameter. + Send data to appropriate processes. + Place recd. data in user buf. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were read all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. For example, to read a distributed + array from a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + int i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, real_off; + char *read_buf = NULL, *tmp_buf; + MPI_Count *curr_offlen_ptr, *count, *send_size, *recv_size; + MPI_Count *partial_send, *recd_from_proc, *start_pos; + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset real_size, size, for_curr_iter, for_next_iter; + int rank; + MPI_Aint coll_bufsize; + MPI_Aint actual_recved_bytes = 0; + MPI_Offset r_len; + +/* calculate the number of reads of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. + coll_bufsize is obtained from the hints object. 
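+   As a worked example with hypothetical sizes: if coll_bufsize = 16 MiB
+   and this process's file-domain span (end_loc - st_loc + 1) is 40 MiB,
+   then ntimes = ceil(40 / 16) = 3, and max_ntimes is the MPI_MAX of
+   ntimes over all processes, i.e. the number of communication phases
+   every process must take part in even after its own I/O is done.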
*/ + + coll_bufsize = fd->hints->cb_buffer_size; + + /* grab some initial values for st_loc and end_loc */ + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + /* now find the real values */ + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + + /* calculate ntimes, the number of times this process must perform I/O + * operations in order to complete all the requests it has received. + * the need for multiple I/O operations comes from the restriction that + * we only use coll_bufsize bytes of memory for internal buffering. + */ + if ((st_loc == -1) && (end_loc == -1)) { + /* this process does no I/O. */ + ntimes = 0; + } else { + /* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize) */ + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->read_counter[0] = MAX(fd->read_counter[0], max_ntimes); +#endif + + read_buf = fd->io_buf; /* Allocated at open time */ + + curr_offlen_ptr = NCI_Calloc(nprocs * 7, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. */ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_send = count + nprocs; + /* if only a portion of the last off-len pair is sent to a process + * in a particular iteration, the length sent is stored here. + * calloc initializes to 0. */ + + send_size = partial_send + nprocs; + /* total size of data to be sent to each proc. in an iteration */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recd_from_proc = recv_size + nprocs; + /* amount of data recd. so far from each proc. Used in Fill_user_buffer. + * initialized to 0 here. */ + + start_pos = recd_from_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; + for_curr_iter = for_next_iter = 0; + + MPI_Comm_rank(fd->comm, &rank); + + for (m = 0; m < ntimes; m++) { + /* read buf of size coll_bufsize (or less) */ + /* go through all others_req and check if any are satisfied + * by the current read */ + + /* since MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order, I can maintain a pointer + * (curr_offlen_ptr) to + * current off-len pair for each process in others_req and scan + * further only from there. There is still a problem of filetypes + * such as: (1, 2, 3 are not process nos. They are just numbers for + * three chunks of data, specified by a filetype.) + * + * 1 -------!-- + * 2 -----!---- + * 3 --!----- + * + * where ! indicates where the current read_size limitation cuts + * through the filetype. I resolve this by reading up to !, but + * filling the communication buffer only for 1. I copy the portion + * left over for 2 into a tmp_buf for use in the next + * iteration. i.e., 2 and 3 will be satisfied in the next + * iteration. This simplifies filling in the user's buf at the + * other end, as only one off-len pair with incomplete data + * will be sent. 
I also don't need to send the individual + * offsets and lens along with the data, as the data is being + * sent in a particular order. */ + + /* off = start offset in the file for the data actually read in + * this iteration + * size = size of data read corresponding to off + * real_off = off minus whatever data was retained in memory from + * previous iteration for cases like 2, 3 illustrated above + * real_size = size plus the extra corresponding to real_off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + bool flag = false; + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + for (MPI_Count j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; + if (partial_send[i]) { + req_off = others_req[i].offsets[j] + partial_send[i]; + } else { + req_off = others_req[i].offsets[j]; + } + if (req_off < off + size) { + flag = true; + } + } + } + } + if (flag) { + /* This should be only reached by I/O aggregators only */ + r_len = PNCIO_ReadContig(fd, read_buf + for_curr_iter, size, off); + if (r_len < 0) return r_len; + size = r_len; + } + + real_off = off - for_curr_iter; + real_size = size + for_curr_iter; + + for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for_next_iter = 0; + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j = 0; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + MPI_Offset req_off; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Offset req_len; +#else + int req_len; +#endif + if (partial_send[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_send[i]; + req_len = others_req[i].lens[j] - partial_send[i]; + partial_send[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < real_off + real_size) { + count[i]++; + MPI_Aint addr; + MPI_Get_address(read_buf + req_off - real_off, &addr); + others_req[i].mem_ptrs[j] = addr; + send_size[i] += (MIN(real_off + real_size - req_off, req_len)); + + if (real_off + real_size - req_off < req_len) { + partial_send[i] = (real_off + real_size - req_off); + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < real_off + real_size)) { + /* this is the case illustrated in the + * figure above. 
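+                             * A hypothetical instance: if real_off +
+                             * real_size = 100 and others_req[i].offsets[j+1]
+                             * = 80, then 20 bytes of request j+1 have
+                             * already been read and must be retained, so
+                             * for_next_iter is raised to at least 20.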
*/ + for_next_iter = MAX(for_next_iter, + real_off + real_size - + others_req[i].offsets[j + 1]); + /* max because it must cover requests + * from different processes */ + } + break; + } + } else + break; + } + curr_offlen_ptr[i] = j; + } + } + + for_curr_iter = for_next_iter; + + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + + + if (for_next_iter) { + tmp_buf = (char *) NCI_Malloc(for_next_iter); + memcpy(tmp_buf, read_buf + real_size - for_next_iter, for_next_iter); + NCI_Free(fd->io_buf); + fd->io_buf = (char *) NCI_Malloc(for_next_iter + coll_bufsize); + memcpy(fd->io_buf, tmp_buf, for_next_iter); + read_buf = fd->io_buf; + NCI_Free(tmp_buf); + } + + off += size; + done += size; + } + + for (i = 0; i < nprocs; i++) + count[i] = send_size[i] = 0; + for (m = ntimes; m < max_ntimes; m++) { + /* nothing to send, but check for recv. */ + MPI_Aint recved_bytes = 0; + R_Exchange_data(fd, buf, buf_view, send_size, recv_size, count, + start_pos, partial_send, recd_from_proc, nprocs, + myrank, min_st_offset, fd_size, fd_start, fd_end, + others_req, m, buf_idx, &recved_bytes); + actual_recved_bytes += recved_bytes; + } + + NCI_Free(curr_offlen_ptr); + + return actual_recved_bytes; +} + +static void R_Exchange_data(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_send, MPI_Count * recd_from_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, int iter, + MPI_Aint * buf_idx, MPI_Aint * actual_recved_bytes) +{ + int i, nprocs_recv, nprocs_send; + char **recv_buf = NULL; + size_t memLen; + MPI_Request *requests; + MPI_Datatype send_type; + MPI_Status *statuses; + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double curT = MPI_Wtime(); +#endif + +/* exchange send_size info so that each process knows how much to + receive from whom and how much memory to allocate. */ + + MPI_Alltoall(send_size, 1, MPI_COUNT, recv_size, 1, MPI_COUNT, fd->comm); + + nprocs_recv = 0; + nprocs_send = 0; + memLen = 0; + for (i = 0; i < nprocs; i++) { + memLen += recv_size[i]; + if (recv_size[i]) + nprocs_recv++; + if (send_size[i]) + nprocs_send++; + } + + requests = (MPI_Request *) + NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Request)); +/* +1 to avoid a 0-size malloc */ + +/* post recvs. if buf_view.is_contig, data can be directly recd. into + user buf at location given by buf_idx. else use recv_buf. 
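+   For example (hypothetical): if buf_idx[i] = 4096 for aggregator i, the
+   recv_size[i] bytes from rank i land directly at ((char *) buf) + 4096,
+   and buf_idx[i] is then advanced by recv_size[i] for the next iteration.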
*/ + + MPI_Count j = 0; // think of this as a counter of non-zero sends/recs + if (buf_view.is_contig) { + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#else + MPI_Irecv(((char *) buf) + buf_idx[i], recv_size[i], + MPI_BYTE, i, 0, fd->comm, requests + j); +#endif + j++; + buf_idx[i] += recv_size[i]; + } + } + } else { + /* allocate memory for recv_buf and post receives */ + recv_buf = (char **) NCI_Malloc(nprocs * sizeof(char *)); + recv_buf[0] = (char *) NCI_Malloc(memLen); + for (i = 1; i < nprocs; i++) + recv_buf[i] = recv_buf[i - 1] + recv_size[i - 1]; + + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Irecv_c(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#else + MPI_Irecv(recv_buf[i], recv_size[i], MPI_BYTE, i, + 0, fd->comm, requests + j); +#endif + j++; + } + } + } + +/* create derived datatypes and send data */ + + j = 0; + for (i = 0; i < nprocs; i++) { + if (send_size[i]) { + /* take care if the last off-len pair is a partial send */ + MPI_Offset tmp = 0; + MPI_Count k = 0; + if (partial_send[i]) { + k = start_pos[i] + count[i] - 1; + tmp = others_req[i].lens[k]; + others_req[i].lens[k] = partial_send[i]; + } +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Type_create_hindexed_c(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#else + MPI_Type_create_hindexed(count[i], + &(others_req[i].lens[start_pos[i]]), + &(others_req[i].mem_ptrs[start_pos[i]]), + MPI_BYTE, &send_type); +#endif + /* absolute displacement; use MPI_BOTTOM in send */ + MPI_Type_commit(&send_type); + MPI_Isend(MPI_BOTTOM, 1, send_type, i, 0, + fd->comm, requests + nprocs_recv + j); + MPI_Type_free(&send_type); + if (partial_send[i]) + others_req[i].lens[k] = tmp; + j++; + } + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[4] += MPI_Wtime() - curT; +#endif + + + /* +1 to avoid a 0-size malloc */ + statuses = (MPI_Status *) NCI_Malloc((nprocs_send + nprocs_recv + 1) * sizeof(MPI_Status)); + + /* wait on the receives */ + if (nprocs_recv) { +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif + MPI_Waitall(nprocs_recv, requests, statuses); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + + *actual_recved_bytes = 0; + j = 0; + for (i = 0; i < nprocs; i++) { + if (recv_size[i]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count count_recved; + MPI_Get_count_c(&statuses[j], MPI_BYTE, &count_recved); +#else + int count_recved; + MPI_Get_count(&statuses[j], MPI_BYTE, &count_recved); +#endif + *actual_recved_bytes += count_recved; + j++; + } + } + + /* if noncontiguous, to the copies from the recv buffers */ + if (!buf_view.is_contig) + Fill_user_buffer(fd, buf, buf_view, recv_buf, recv_size, + recd_from_proc, nprocs, min_st_offset, + fd_size, fd_start, fd_end); + } + + /* wait on the sends */ +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + curT = MPI_Wtime(); +#endif +#ifdef HAVE_MPI_STATUSES_IGNORE + MPI_Waitall(nprocs_send, requests + nprocs_recv, MPI_STATUSES_IGNORE); +#else + MPI_Waitall(nprocs_send, requests + nprocs_recv, statuses + nprocs_recv); +#endif +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->read_timing[3] += MPI_Wtime() - curT; +#endif + 
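+    /* At this point all sends and receives posted in this iteration have
+     * completed; the request/status arrays and, in the noncontiguous case,
+     * the staging receive buffers can now be released. */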
+ NCI_Free(statuses); + NCI_Free(requests); + + if (!buf_view.is_contig) { + NCI_Free(recv_buf[0]); + NCI_Free(recv_buf); + } +} + +#define BUF_INCR { \ + while (buf_incr) { \ + size_in_buf = MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (buf_incr > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ +} + + +#define BUF_COPY { \ + while (size) { \ + size_in_buf = MIN(size, flat_buf_sz); \ + memcpy(((char *) buf) + user_buf_idx, \ + &(recv_buf[p][recv_buf_idx[p]]), size_in_buf); \ + recv_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (size > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ + BUF_INCR \ +} + +static void Fill_user_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, + char **recv_buf, + MPI_Count * recv_size, + MPI_Count * recd_from_proc, int nprocs, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset * fd_start, + MPI_Offset * fd_end) +{ + +/* this function is only called if buftype is not contig */ + + int p, flat_buf_idx; + MPI_Offset flat_buf_sz, size_in_buf, buf_incr, size; + MPI_Offset off, user_buf_idx; + MPI_Offset len, rem_len; + MPI_Count *curr_from_proc, *done_from_proc, *recv_buf_idx; + +/* curr_from_proc[p] = amount of data recd from proc. p that has already + been accounted for so far + done_from_proc[p] = amount of data already recd from proc. p and + filled into user buffer in previous iterations + user_buf_idx = current location in user buffer + recv_buf_idx[p] = current location in recv_buf of proc. p */ + /* combining these three related arrays into a single memory allocation + * (the "times 3" here) can help some highly noncontiguous workloads a bit */ + curr_from_proc = NCI_Malloc(nprocs * 3 * sizeof(*curr_from_proc)); + done_from_proc = curr_from_proc + nprocs; + recv_buf_idx = done_from_proc + nprocs; + + for (int i = 0; i < nprocs; i++) { + recv_buf_idx[i] = curr_from_proc[i] = 0; + done_from_proc[i] = recd_from_proc[i]; + } + + user_buf_idx = buf_view.off[0]; + flat_buf_idx = 0; + flat_buf_sz = buf_view.len[0]; + + /* flat_buf_idx = current index into flattened buftype + * flat_buf_sz = size of current contiguous component in + * flattened buf */ + + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + off = fd->flat_file.off[i]; + rem_len = fd->flat_file.len[i]; + + /* this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. 
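+             * Hypothetical numbers: with fd_size = 1 MiB, a request of
+             * len = 1.5 MiB starting 0.5 MiB into aggregator p's domain
+             * is trimmed here to len = 0.5 MiB; the while loop then
+             * re-enters with off advanced, so the remaining 1 MiB is
+             * attributed to the following file domain(s).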
+ */ + p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end); + + if (recv_buf_idx[p] < recv_size[p]) { + if (curr_from_proc[p] + len > done_from_proc[p]) { + if (done_from_proc[p] > curr_from_proc[p]) { + size = MIN(curr_from_proc[p] + len - done_from_proc[p], + recv_size[p] - recv_buf_idx[p]); + buf_incr = done_from_proc[p] - curr_from_proc[p]; + BUF_INCR + buf_incr = curr_from_proc[p] + len - done_from_proc[p]; + curr_from_proc[p] = done_from_proc[p] + size; + BUF_COPY + } else { + size = MIN(len, recv_size[p] - recv_buf_idx[p]); + buf_incr = len; + curr_from_proc[p] += size; + BUF_COPY + } + } else { + curr_from_proc[p] += len; + buf_incr = len; + BUF_INCR + } + } else { + buf_incr = len; + BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (int i = 0; i < nprocs; i++) + if (recv_size[i]) + recd_from_proc[i] = curr_from_proc[i]; + + NCI_Free(curr_from_proc); +} diff --git a/src/drivers/pncio/pncio_read_str.c b/src/drivers/pncio/pncio_read_str.c new file mode 100644 index 000000000..ae554c2fe --- /dev/null +++ b/src/drivers/pncio/pncio_read_str.c @@ -0,0 +1,259 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#define BUFFERED_READ { \ + if (req_off >= readbuf_off + readbuf_len) { \ + readbuf_off = req_off; \ + readbuf_len = MIN(max_bufsize, end_offset-readbuf_off+1); \ + r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off); \ + if (r_len < 0) return r_len; \ + total_r_len += r_len; \ + } \ + while (req_len > readbuf_off + readbuf_len - req_off) { \ + partial_read = readbuf_off + readbuf_len - req_off; \ + tmp_buf = (char *) NCI_Malloc(partial_read); \ + memcpy(tmp_buf, readbuf+readbuf_len-partial_read, partial_read); \ + NCI_Free(readbuf); \ + readbuf = (char *) NCI_Malloc(partial_read + max_bufsize); \ + memcpy(readbuf, tmp_buf, partial_read); \ + NCI_Free(tmp_buf); \ + readbuf_off += readbuf_len-partial_read; \ + readbuf_len = partial_read + \ + MIN(max_bufsize, end_offset-readbuf_off+1); \ + r_len = PNCIO_ReadContig(fd, readbuf+partial_read, \ + readbuf_len-partial_read, \ + readbuf_off+partial_read); \ + if (r_len < 0) return r_len; \ + total_r_len += r_len; \ + } \ + memcpy((char*)buf+userbuf_off, readbuf+req_off-readbuf_off, req_len); \ +} + + +MPI_Offset PNCIO_GEN_ReadStrided(PNCIO_File *fd, + void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + char *readbuf, *tmp_buf, *value; + int i, j, k, st_index=0, info_flag; + + MPI_Aint max_bufsize, readbuf_len; + MPI_Offset i_offset, new_brd_size, brd_size, size, abs_off_in_filetype=0; + MPI_Offset new_frd_size, frd_size=0, st_frd_size, userbuf_off, req_len; + MPI_Offset sum, off, req_off, disp, end_offset=0, readbuf_off, start_off; + MPI_Offset r_len, total_r_len=0; + MPI_Count num, bufsize, partial_read; + +// printf("%s at %d:\n",__func__,__LINE__); + + if (fd->hints->ds_read == PNCIO_HINT_DISABLE) { + /* if user has disabled data sieving on reads, use naive + * approach instead. + */ + return PNCIO_GEN_ReadStrided_naive(fd, buf, buf_view, offset); + } + +/* This subroutine is entered with filetype being non-contiguous only */ +assert(fd->filetype == MPI_BYTE); +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + bufsize = buf_view.size; + + /* get max_bufsize from the info object. 
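+     * The hint value is an ASCII string under the "ind_rd_buffer_size"
+     * key, e.g. "4194304" (a hypothetical 4 MiB setting); atoi()
+     * converts it below.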
*/ + value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "ind_rd_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + max_bufsize = atoi(value); + NCI_Free(value); + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. */ + + off = fd->disp + offset; + + start_off = off; + end_offset = off + bufsize - 1; + readbuf_off = off; + readbuf = (char *) NCI_Malloc(max_bufsize); + readbuf_len = MIN(max_bufsize, end_offset - readbuf_off + 1); + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + r_len = PNCIO_ReadContig(fd, readbuf, readbuf_len, readbuf_off); + if (r_len < 0) return r_len; + + for (i = 0; i < buf_view.count; i++) { + userbuf_off = buf_view.off[i]; + req_off = off; + req_len = buf_view.len[i]; + BUFFERED_READ + off += buf_view.len[i]; + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(readbuf); + } + + else { /* noncontiguous in file */ + MPI_Offset size_in_filetype = offset; + + disp = fd->disp; + + sum = 0; + for (i = 0; i < fd->flat_file.count; i++) { + sum += fd->flat_file.len[i]; + if (sum > size_in_filetype) { + st_index = i; + frd_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[i] + + size_in_filetype - (sum - fd->flat_file.len[i]); + break; + } + } + + /* abs. offset in bytes in the file */ + offset = disp + abs_off_in_filetype; + + start_off = offset; + + /* Wei-keng Liao: read request is within a single flat_file contig + * block e.g. with subarray types that actually describe the whole + * array */ + if (buf_view.is_contig && bufsize <= frd_size) { + /* a count of bytes can overflow. operate on original type instead */ + r_len = PNCIO_ReadContig(fd, buf, buf_view.size, offset); + +assert(buf_view.size == r_len); + return r_len; + } + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ + + st_frd_size = frd_size; + i_offset = 0; + j = st_index; + off = offset; + frd_size = MIN(st_frd_size, bufsize); + while (i_offset < bufsize) { + i_offset += frd_size; + end_offset = off + frd_size - 1; + +if (i_offset >= bufsize) break; + j++; + off = disp + fd->flat_file.off[j]; + frd_size = MIN(fd->flat_file.len[j], bufsize - i_offset); + } + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + readbuf_off = 0; + readbuf_len = 0; + readbuf = (char *) NCI_Malloc(max_bufsize); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file should be the most + * common case. 
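+             * Data sieving, illustrated with made-up numbers: to read the
+             * file pieces {off=0, len=4} and {off=8, len=4}, one 12-byte
+             * contiguous read at offset 0 fills readbuf, and the two
+             * pieces are memcpy'd out of it into the user buffer.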
+ */ + i_offset = 0; + j = st_index; + off = offset; + frd_size = MIN(st_frd_size, bufsize); + while (i_offset < bufsize) { + if (frd_size) { + req_off = off; + req_len = frd_size; + userbuf_off = i_offset; + BUFFERED_READ + } + + i_offset += frd_size; + if (i_offset >= bufsize) break; + + if (off + frd_size < disp + fd->flat_file.off[j] + + fd->flat_file.len[j]) + off += frd_size; /* off is incremented by frd_size */ + else { + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + frd_size = MIN(fd->flat_file.len[j], + bufsize - i_offset); + } + } + } else { + /* noncontiguous in memory as well as in file */ + k = num = 0; + i_offset = buf_view.off[0]; + j = st_index; + off = offset; + frd_size = st_frd_size; + brd_size = buf_view.len[0]; + + while (num < bufsize) { + size = MIN(frd_size, brd_size); + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + BUFFERED_READ + } + + num += size; + if (num >= bufsize) break; + + new_frd_size = frd_size; + new_brd_size = brd_size; + + if (size == frd_size) { + /* reached end of contiguous block in file */ + j++; +assert(j < fd->flat_file.count); + off = disp + fd->flat_file.off[j]; + new_frd_size = fd->flat_file.len[j]; + if (size != brd_size) { + i_offset += size; + new_brd_size -= size; + } + } + + if (size == brd_size) { + /* reached end of contiguous block in memory */ + k++; +assert(k < buf_view.count); + i_offset = buf_view.off[k]; + new_brd_size = buf_view.len[k]; + if (size != frd_size) { + off += size; + new_frd_size -= size; + } + } + frd_size = new_frd_size; + brd_size = new_brd_size; + } + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + NCI_Free(readbuf); /* malloced in the buffered_read macro */ + } + + assert(total_r_len >= buf_view.size); + + return buf_view.size; +} diff --git a/src/drivers/pncio/pncio_read_str_naive.c b/src/drivers/pncio/pncio_read_str_naive.c new file mode 100644 index 000000000..fa003f403 --- /dev/null +++ b/src/drivers/pncio/pncio_read_str_naive.c @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +MPI_Offset PNCIO_GEN_ReadStrided_naive(PNCIO_File *fd, + void *buf, + PNCIO_View buf_view, + MPI_Offset offset) +{ + int b_index; + MPI_Offset size, brd_size, frd_size=0, req_len, sum, off, req_off, disp; + MPI_Offset end_offset=0, start_off, abs_off_in_filetype=0, userbuf_off; + MPI_Offset r_len, total_r_len=0; + MPI_Count bufsize; + +// printf("%s at %d:\n",__func__,__LINE__); + + if (fd->flat_file.size == 0) + return 0; + + bufsize = buf_view.size; + + /* contiguous in buftype and filetype is handled elsewhere */ + + if (!buf_view.is_contig && fd->flat_file.is_contig) { + /* noncontiguous in memory, contiguous in file. 
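+         * In this naive scheme every contiguous piece is serviced by its
+         * own PNCIO_ReadContig() call, e.g. (hypothetically) a buffer view
+         * with buf_view.count = 1000 pieces issues 1000 separate reads;
+         * no sieving buffer is used here.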
*/ + + off = fd->disp + offset; + + start_off = off; + end_offset = off + bufsize - 1; + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + /* for each region in the buffer, grab the data and put it in place */ + for (b_index = 0; b_index < buf_view.count; b_index++) { + userbuf_off = buf_view.off[b_index]; + req_off = off; + req_len = buf_view.len[b_index]; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + + /* off is (potentially) used to save the final offset later */ + off += buf_view.len[b_index]; + } + + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + } + else { /* noncontiguous in file */ + MPI_Offset size_in_filetype = offset; + + int f_index, st_index = 0; + MPI_Offset st_frd_size; + + /* First we're going to calculate a set of values for use in all + * the noncontiguous in file cases: + * start_off - starting byte position of data in file + * end_offset - last byte offset to be accessed in the file + * st_index - index of block in first filetype that we will be + * starting in (?) + * st_frd_size - size of the data in the first filetype block + * that we will read (accounts for being part-way + * into reading this block of the filetype + * + */ + + disp = fd->disp; + + sum = 0; + for (f_index = 0; f_index < fd->flat_file.count; f_index++) { + sum += fd->flat_file.len[f_index]; + if (sum > size_in_filetype) { + st_index = f_index; + frd_size = sum - size_in_filetype; + abs_off_in_filetype = fd->flat_file.off[f_index] + + size_in_filetype - (sum - fd->flat_file.len[f_index]); + break; + } + } + + /* abs. offset in bytes in the file */ + start_off = disp + abs_off_in_filetype; + + st_frd_size = frd_size; + + /* start_off, st_index, and st_frd_size are + * all calculated at this point + */ + + /* Calculate end_offset, the last byte-offset that will be accessed. + * e.g., if start_off=0 and 100 bytes to be read, end_offset=99 + */ + f_index = st_index; + userbuf_off = frd_size = MIN(st_frd_size, bufsize); + end_offset = start_off + frd_size - 1; + while (userbuf_off < bufsize) { + f_index++; +assert(f_index < fd->flat_file.count); + + off = disp + fd->flat_file.off[f_index]; + frd_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + userbuf_off += frd_size; + end_offset = off + frd_size - 1; + } + + /* End of calculations. At this point the following values have + * been calculated and are ready for use: + * - start_off + * - end_offset + * - st_index + * - st_frd_size + */ + + /* if atomicity is true, lock (exclusive) the region to be accessed */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + + if (buf_view.is_contig && !fd->flat_file.is_contig) { + /* contiguous in memory, noncontiguous in file. should be the + * most common case. + */ + + userbuf_off = 0; + f_index = st_index; + off = start_off; + frd_size = MIN(st_frd_size, bufsize); + + /* while there is still space in the buffer, read more data */ + while (userbuf_off < bufsize) { + if (frd_size) { + /* TYPE_UB and TYPE_LB can result in + * frd_size = 0. 
save system call in such cases */ + req_off = off; + req_len = frd_size; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + } + userbuf_off += frd_size; + if (userbuf_off >= bufsize) break; + + if (off + frd_size < disp + fd->flat_file.off[f_index] + + fd->flat_file.len[f_index]) { + /* important that this value be correct, as it is + * used to set the offset in the fd near the end of + * this function. + */ + off += frd_size; + } + /* did not reach end of contiguous block in filetype. + * no more I/O needed. off is incremented by frd_size. + */ + else { + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + frd_size = MIN(fd->flat_file.len[f_index], + bufsize - userbuf_off); + } + } + } else { + MPI_Offset i_offset, tmp_bufsize = 0; + /* noncontiguous in memory as well as in file */ + + b_index = 0; + i_offset = buf_view.off[0]; + f_index = st_index; + off = start_off; + frd_size = st_frd_size; + brd_size = buf_view.len[0]; + + /* while we haven't read size * count bytes, keep going */ + while (tmp_bufsize < bufsize) { + MPI_Offset new_brd_size = brd_size, new_frd_size = frd_size; + + size = MIN(frd_size, brd_size); + /* keep max of a single read amount <= INT_MAX */ + size = MIN(size, INT_MAX); + + if (size) { + req_off = off; + req_len = size; + userbuf_off = i_offset; + + r_len = PNCIO_ReadContig(fd, (char *) buf + userbuf_off, + req_len, req_off); + if (r_len < 0) return r_len; + total_r_len += r_len; + } + + tmp_bufsize += size; + if (tmp_bufsize >= bufsize) break; + + if (size == frd_size) { + /* reached end of contiguous block in file */ + f_index++; +assert(f_index < fd->flat_file.count); + off = disp + fd->flat_file.off[f_index]; + + new_frd_size = fd->flat_file.len[f_index]; + if (size != brd_size) { + i_offset += size; + new_brd_size -= size; + } + } + + if (size == brd_size) { + /* reached end of contiguous block in memory */ + b_index++; +assert(b_index < buf_view.count); + i_offset = buf_view.off[b_index]; + new_brd_size = buf_view.len[b_index]; + if (size != frd_size) { + off += size; + new_frd_size -= size; + } + } + frd_size = new_frd_size; + brd_size = new_brd_size; + } + } + + /* unlock the file region if we locked it */ + if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS)) + PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); + + } /* end of (else noncontiguous in file) */ + + return total_r_len; +} diff --git a/src/drivers/pncio/pncio_set_size.c b/src/drivers/pncio/pncio_set_size.c new file mode 100644 index 000000000..77490f481 --- /dev/null +++ b/src/drivers/pncio/pncio_set_size.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h> /* ftruncate(), lseek() */
+#endif
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_set_size() >-----------------------------------------------*/
+int PNCIO_File_set_size(PNCIO_File *fd,
+                        MPI_Offset  size)
+{
+    int err = NC_NOERR, rank;
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        err = ftruncate(fd->fd_sys, (off_t) size);
+        if (err != 0)
+            err = ncmpii_error_posix2nc("ftruncate");
+    }
+
+    MPI_Bcast(&err, 1, MPI_INT, 0, fd->comm);
+
+    return err;
+}
+
+/*----< PNCIO_File_get_size() >-----------------------------------------------*/
+int PNCIO_File_get_size(PNCIO_File *fd,
+                        MPI_Offset *size)
+{
+    int err = NC_NOERR, rank;
+    MPI_Offset msg[2];
+
+    MPI_Comm_rank(fd->comm, &rank);
+
+    if (rank == 0) {
+        *size = lseek(fd->fd_sys, 0, SEEK_END);
+        if (*size == -1)
+            err = ncmpii_error_posix2nc("lseek");
+        msg[0] = err;
+        msg[1] = *size;
+    }
+
+    MPI_Bcast(msg, 2, MPI_OFFSET, 0, fd->comm);
+    err = (int)msg[0];
+    *size = msg[1];
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_set_view.c b/src/drivers/pncio/pncio_set_view.c
new file mode 100644
index 000000000..ddf41e968
--- /dev/null
+++ b/src/drivers/pncio/pncio_set_view.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_set_view() >-----------------------------------------------*/
+/* For PnetCDF, this subroutine is an independent call, because PnetCDF uses
+ * only the following:
+ * Argument etype is always MPI_BYTE.
+ * Argument datarep is always "native".
+ * Argument info is always MPI_INFO_NULL.
+ */
+int PNCIO_File_set_view(PNCIO_File   *fd,
+                        MPI_Offset    disp,
+                        MPI_Datatype  filetype,
+                        MPI_Aint      npairs,
+#ifdef HAVE_MPI_LARGE_COUNT
+                        MPI_Count    *offsets,
+                        MPI_Count    *lengths
+#else
+                        MPI_Offset   *offsets,
+                        int          *lengths
+#endif
+)
+{
+    MPI_Aint i;
+
+assert(filetype == MPI_BYTE);
+assert(disp == 0);
+fd->filetype = filetype;
+fd->disp = 0;
+
+    fd->flat_file.count = npairs;
+    fd->flat_file.off   = offsets;
+    fd->flat_file.len   = lengths;
+    fd->flat_file.idx   = 0;
+    fd->flat_file.rem   = (npairs > 0) ? lengths[0] : 0;
+
+    /* Size of fileview must be calculated here, as PnetCDF may coalesce the
+     * offset-length pairs in order to make offsets sorted in a monotonically
+     * non-decreasing order.
+     */
+    fd->flat_file.size = 0;
+    for (i=0; i<npairs; i++) fd->flat_file.size += lengths[i];
+
+    /* is_contig is redundant to (count <= 1), but convenient */
+    fd->flat_file.is_contig = (npairs <= 1);
+
+    return NC_NOERR;
+}
+
diff --git a/src/drivers/pncio/pncio_sync.c b/src/drivers/pncio/pncio_sync.c
new file mode 100644
index 000000000..49dc31bff
--- /dev/null
+++ b/src/drivers/pncio/pncio_sync.c
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2025, Northwestern University
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strdup() */
+#include <assert.h>
+#include <errno.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h> /* fsync(), unlink(), ftruncate(), lseek() */
+#endif
+
+#include <mpi.h>
+
+#include <pnc_debug.h>
+#include <common.h>
+#include "pncio.h"
+
+/*----< PNCIO_File_sync() >---------------------------------------------------*/
+int PNCIO_File_sync(PNCIO_File *fd)
+{
+    int err = NC_NOERR;
+
+    if (fd->is_open > 0) {
+        err = fsync(fd->fd_sys);
+        if (err != 0)
+            err = ncmpii_error_posix2nc("fsync");
+    }
+
+    return err;
+}
+
diff --git a/src/drivers/pncio/pncio_utils.c b/src/drivers/pncio/pncio_utils.c
new file mode 100644
index 000000000..c4c1629e1
--- /dev/null
+++ b/src/drivers/pncio/pncio_utils.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdarg.h> /* va_start(), va_end() */
+
+#include <pncio.h>
+
+/* some systems do not have pread/pwrite, or require XOPEN_SOURCE set higher
+ * than we would like. see #1973 */
+#if (HAVE_DECL_PWRITE == 0)
+
+#include <sys/types.h>
+#include <unistd.h>
+
+ssize_t pread(int fd, void *buf, size_t count, off_t offset);
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset);
+
+ssize_t pread(int fd, void *buf, size_t count, off_t offset)
+{
+    off_t lseek_ret;
+    off_t old_offset;
+    ssize_t read_ret;
+
+    old_offset = lseek(fd, 0, SEEK_CUR);
+    lseek_ret = lseek(fd, offset, SEEK_SET);
+    if (lseek_ret == -1)
+        return lseek_ret;
+    read_ret = read(fd, buf, count);
+    if (read_ret < 0)
+        return read_ret;
+    /* man page says "file offset is not changed" */
+    if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0)
+        return lseek_ret;
+
+    return read_ret;
+}
+
+ssize_t pwrite(int fd, const void *buf, size_t count, off_t offset)
+{
+    off_t lseek_ret;
+    off_t old_offset;
+    ssize_t write_ret;
+
+    old_offset = lseek(fd, 0, SEEK_CUR);
+    lseek_ret = lseek(fd, offset, SEEK_SET);
+    if (lseek_ret == -1)
+        return lseek_ret;
+    write_ret = write(fd, buf, count);
+    if (write_ret < 0)
+        return write_ret;
+    /* man page says "file offset is not changed" */
+    if ((lseek_ret = lseek(fd, old_offset, SEEK_SET)) < 0)
+        return lseek_ret;
+
+    return write_ret;
+}
+#endif
+
+void PNCIO_Heap_merge(PNCIO_Access * others_req, MPI_Count * count,
+                      MPI_Offset * srt_off, MPI_Count * srt_len,
+                      MPI_Count * start_pos, int nprocs, int nprocs_recv,
+                      MPI_Count total_elements)
+{
+    typedef struct {
+        MPI_Offset *off_list;
+#ifdef HAVE_MPI_LARGE_COUNT
+        MPI_Offset *len_list;
+#else
+        int *len_list;
+#endif
+        MPI_Count nelem;
+    } heap_struct;
+
+    heap_struct *a, tmp;
+    int i, j, heapsize, l, r, k, smallest;
+
+    a = (heap_struct *) NCI_Malloc((nprocs_recv + 1) * sizeof(heap_struct));
+
+    j = 0;
+    for (i = 0; i < nprocs; i++)
+        if (count[i]) {
+            a[j].off_list = &(others_req[i].offsets[start_pos[i]]);
+            a[j].len_list = &(others_req[i].lens[start_pos[i]]);
+            a[j].nelem = count[i];
+            j++;
+        }
+
+    /* build a heap out of the first element from each list, with
+     * the smallest element of the heap at the root */
+
+    heapsize = nprocs_recv;
+    for (i = heapsize / 2 - 1; i >= 0; i--) {
+        /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143
+         * modified for a heap with smallest element at root. I have
+         * removed the recursion so that there are no function calls.
+         * Function calls are too expensive.
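+         * An illustration with hypothetical lists: merging the sorted
+         * per-process offset lists {0,5}, {2,3} and {7} pops the smallest
+         * heap root each time, yielding srt_off = {0,2,3,5,7}; the popped
+         * list's head pointer advances, and heapsize shrinks once a list
+         * is exhausted.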
*/ + k = i; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + + for (i = 0; i < total_elements; i++) { + /* extract smallest element from heap, i.e. the root */ + srt_off[i] = *(a[0].off_list); + srt_len[i] = *(a[0].len_list); + (a[0].nelem)--; + + if (!a[0].nelem) { + a[0].off_list = a[heapsize - 1].off_list; + a[0].len_list = a[heapsize - 1].len_list; + a[0].nelem = a[heapsize - 1].nelem; + heapsize--; + } else { + (a[0].off_list)++; + (a[0].len_list)++; + } + + /* Heapify(a, 0, heapsize); */ + k = 0; + for (;;) { + l = 2 * (k + 1) - 1; + r = 2 * (k + 1); + + if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) + smallest = l; + else + smallest = k; + + if ((r < heapsize) && (*(a[r].off_list) < *(a[smallest].off_list))) + smallest = r; + + if (smallest != k) { + tmp.off_list = a[k].off_list; + tmp.len_list = a[k].len_list; + tmp.nelem = a[k].nelem; + + a[k].off_list = a[smallest].off_list; + a[k].len_list = a[smallest].len_list; + a[k].nelem = a[smallest].nelem; + + a[smallest].off_list = tmp.off_list; + a[smallest].len_list = tmp.len_list; + a[smallest].nelem = tmp.nelem; + + k = smallest; + } else + break; + } + } + NCI_Free(a); +} + diff --git a/src/drivers/pncio/pncio_write.c b/src/drivers/pncio/pncio_write.c new file mode 100644 index 000000000..debd07e1d --- /dev/null +++ b/src/drivers/pncio/pncio_write.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2025, Northwestern University + * See COPYRIGHT notice in top-level directory. 
+ */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include /* pwrite() */ + +#include + +#include "pncio.h" + +#ifdef WKL_DEBUG +int first_ost_id; +#endif + +/*----< PNCIO_WriteContig() >-------------------------------------------------*/ +MPI_Offset PNCIO_WriteContig(PNCIO_File *fd, + const void *buf, + MPI_Offset w_size, + MPI_Offset offset) +{ + ssize_t err = 0; + size_t w_count; + MPI_Offset bytes_xfered = 0; + char *p; + + if (w_size == 0) return NC_NOERR; + +// printf("%s at %d: pwrite offset=%lld w_size=%lld\n",__func__,__LINE__,offset,w_size); +#ifdef WKL_DEBUG +int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); + +MPI_Offset ost_id = (offset / fd->hints->striping_unit) % fd->hints->striping_factor; + if (first_ost_id == -1) { + first_ost_id = ost_id; + // printf("%2d %s file %s First pwrite offset=%lld OST %d\n",rank,__func__,fd->filename,offset,first_ost_id); + } + else if (ost_id != first_ost_id) + printf("%2d Error: %s pwrite offset=%lld w_size=%lld ost_id=%lld not same 1st ost %d\n",rank,__func__,offset,w_size,ost_id,first_ost_id); + +printf("%s line %d: disp=%lld offset=%lld count=%ld bufType_size=%d w_size=%lld\n",__func__,__LINE__,fd->disp,offset,count,bufType_size,w_size); + + printf("%2d %s line %d pwrite offset=%lld w_size=%lld\n",rank,__func__,__LINE__,offset,w_size); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + double timing = MPI_Wtime(); +#endif + p = (char *) buf; + while (bytes_xfered < w_size) { + w_count = w_size - bytes_xfered; + err = pwrite(fd->fd_sys, p, w_count, offset + bytes_xfered); + if (err == -1) + goto ioerr; + if (err == 0) + break; + bytes_xfered += err; + p += err; + } +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_timing[2] += MPI_Wtime() - timing; +#endif + +ioerr: + if (err == -1) + bytes_xfered = ncmpii_error_posix2nc("pwrite"); + + return bytes_xfered; +} + +/*----< file_write() >-------------------------------------------------------*/ +/* This is an independent call. */ +static +MPI_Offset file_write(PNCIO_File *fd, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + MPI_Offset w_len; + + if (buf_view.size == 0) /* zero-sized request */ + return NC_NOERR; + +assert(fd->filetype == MPI_BYTE); + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + else if (fd->file_system == PNCIO_LUSTRE) + w_len = PNCIO_LUSTRE_WriteStrided(fd, buf, buf_view, offset); + else if (fd->file_system == PNCIO_UFS) + w_len = PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset); + else + return NC_EFSTYPE; + + return w_len; /* when w_len < 0, it is an NetCDF error code */ +} + +/*----< PNCIO_File_write_at() >-----------------------------------------------*/ +/* This is an independent call. + * offset is a position in the file relative to the current view, expressed as + * a count of etypes. + */ +MPI_Offset PNCIO_File_write_at(PNCIO_File *fh, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + assert(fh != NULL); + + if (buf_view.size == 0) /* zero-sized request */ + return NC_NOERR; + + if (buf_view.size < 0) return NC_ENEGATIVECNT; + + if (fh->access_mode & MPI_MODE_RDONLY) + return NC_EPERM; + + return file_write(fh, offset, buf, buf_view); +} + +/*----< PNCIO_File_write_at_all() >-------------------------------------------*/ +/* This is a collective call. 
+ * offset is a position in the file relative to the current view, expressed as + * a count of etypes. + */ +MPI_Offset PNCIO_File_write_at_all(PNCIO_File *fh, + MPI_Offset offset, + const void *buf, + PNCIO_View buf_view) +{ + int err=NC_NOERR; + MPI_Offset w_len; + + assert(fh != NULL); + + if (buf_view.size < 0) err = NC_ENEGATIVECNT; + + if (fh->access_mode & MPI_MODE_RDONLY && err == NC_NOERR) + err = NC_EPERM; + + if (fh->file_system == PNCIO_LUSTRE) + w_len = PNCIO_LUSTRE_WriteStridedColl(fh, buf, buf_view, offset); + else if (fh->file_system == PNCIO_UFS) + w_len = PNCIO_GEN_WriteStridedColl(fh, buf, buf_view, offset); + else + return NC_EFSTYPE; + + return (err == NC_NOERR) ? w_len : err; +} + + diff --git a/src/drivers/pncio/pncio_write_coll.c b/src/drivers/pncio/pncio_write_coll.c new file mode 100644 index 000000000..d6126c5ce --- /dev/null +++ b/src/drivers/pncio/pncio_write_coll.c @@ -0,0 +1,898 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include "pncio.h" + +/* prototypes of functions used for collective writes only. */ +static MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, int nprocs, int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx); + +static MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf, + PNCIO_View buf_view, + MPI_Count * send_size, MPI_Count * recv_size, + MPI_Offset off, MPI_Count size, /* 10 */ + MPI_Count * count, MPI_Count * start_pos, + MPI_Count * partial_recv, MPI_Count * + sent_to_proc, int nprocs, + int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + PNCIO_Access * others_req, + MPI_Count *send_buf_idx, MPI_Count *curr_to_proc, + MPI_Count *done_to_proc, int *hole, int iter, + MPI_Aint * buf_idx); + +static void Fill_send_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, char **send_buf, + MPI_Count *send_size, MPI_Request *requests, + MPI_Count *sent_to_proc, int nprocs, int myrank, + MPI_Offset min_st_offset, + MPI_Offset fd_size, MPI_Offset *fd_start, + MPI_Offset *fd_end, MPI_Count *send_buf_idx, + MPI_Count *curr_to_proc, MPI_Count *done_to_proc, int iter); + +MPI_Offset PNCIO_GEN_WriteStridedColl(PNCIO_File *fd, + const void *buf, + PNCIO_View buf_view, + MPI_Offset offset) /* relative to fileview */ +{ + /* Uses a generalized version of the extended two-phase method described in + * "An Extended Two-Phase Method for Accessing Sections of Out-of-Core + * Arrays", Rajeev Thakur and Alok Choudhary, Scientific Programming, + * (5)4:301--317, Winter 1996. + * http://www.mcs.anl.gov/home/thakur/ext2ph.ps + */ + + PNCIO_Access *my_req; + /* array of nprocs access structures, one for each other process in + * whose file domain this process's request lies */ + + PNCIO_Access *others_req; + /* array of nprocs access structures, one for each other process + * whose request lies in this process's file domain. 
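+     * For example (hypothetical ranks): if part of rank 0's request falls
+     * inside rank 1's file domain, then after the Calc routines below,
+     * rank 0 has my_req[1].count > 0 and rank 1 has others_req[0].count > 0.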
*/ + + int i, nprocs, nprocs_for_coll, myrank, interleave_count=0; + MPI_Aint *buf_idx = NULL; + MPI_Count *count_my_req_per_proc, count_my_req_procs; + MPI_Count *count_others_req_per_proc, count_others_req_procs; + MPI_Offset start_offset, end_offset, fd_size, min_st_offset; + MPI_Offset *st_offsets=NULL, *fd_start=NULL; + MPI_Offset *fd_end=NULL, *end_offsets=NULL, w_len=0; + +// printf("%s at %d: offset=%lld buf_view.size=%lld\n",__func__,__LINE__, offset,buf_view.size); + + MPI_Comm_size(fd->comm, &nprocs); + MPI_Comm_rank(fd->comm, &myrank); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) +double curT = MPI_Wtime(); +#endif + + /* the number of processes that actually perform I/O, nprocs_for_coll, is + * stored in the hints off the PNCIO_File structure + */ + nprocs_for_coll = fd->hints->cb_nodes; + + /* only check for interleaving if cb_write isn't disabled */ + if (fd->hints->cb_write != PNCIO_HINT_DISABLE) { + /* For this process's request, calculate the file start and end + * offsets. Note: end_offset points to the last byte-offset that will + * be accessed, e.g., if start_offset=0 and 100 bytes to be read, + * end_offset=99. + */ + if (fd->flat_file.size == 0) { + start_offset = 0; + end_offset = -1; + } + else if (fd->flat_file.count > 0) { + start_offset = offset + fd->flat_file.off[0]; + end_offset = fd->flat_file.off[fd->flat_file.count-1] + + fd->flat_file.len[fd->flat_file.count-1] - 1; + } + else { + start_offset = offset; + end_offset = offset + fd->flat_file.size - 1; + } + + /* Each process communicates its start and end offsets to other + * processes. The result is an array each of start and end offsets + * stored in order of process rank. + */ + + st_offsets = (MPI_Offset *) NCI_Malloc(nprocs * 2 * sizeof(MPI_Offset)); + end_offsets = st_offsets + nprocs; + + MPI_Allgather(&start_offset, 1, MPI_OFFSET, st_offsets, 1, MPI_OFFSET, + fd->comm); + MPI_Allgather(&end_offset, 1, MPI_OFFSET, end_offsets, 1, MPI_OFFSET, + fd->comm); + + /* Are the accesses of different processes interleaved? Below is a + * rudimentary check for interleaving, but should suffice for the + * moment. + */ + for (i = 1; i < nprocs; i++) + if (st_offsets[i] < end_offsets[i - 1] && + st_offsets[i] <= end_offsets[i]) + interleave_count++; + } + + if (fd->hints->cb_write == PNCIO_HINT_DISABLE || + (!interleave_count && (fd->hints->cb_write == PNCIO_HINT_AUTO))) { + + /* use independent accesses */ + if (fd->hints->cb_write != PNCIO_HINT_DISABLE) + NCI_Free(st_offsets); + if (buf_view.size == 0) return 0; + + /* offset is relative to fileview */ +if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */ + + if (buf_view.is_contig && fd->flat_file.is_contig) { + if (fd->flat_file.count > 0) offset += fd->flat_file.off[0]; + w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset); + } + else + w_len = PNCIO_GEN_WriteStrided(fd, buf, buf_view, offset); + + return w_len; + } + +// printf("%s at %d:\n",__func__,__LINE__); +/* Divide the I/O workload among "nprocs_for_coll" processes. This is + done by (logically) dividing the file into file domains (FDs); each + process may directly access only its own file domain. 
*/ + + PNCIO_Calc_file_domains(st_offsets, end_offsets, nprocs, nprocs_for_coll, + &min_st_offset, &fd_start, &fd_end, &fd_size, + fd->hints->striping_unit); + +/* calculate what portions of the access requests of this process are + located in what file domains */ + + PNCIO_Calc_my_req(fd, min_st_offset, fd_start, fd_end, fd_size, nprocs, + &count_my_req_procs, &count_my_req_per_proc, &my_req, + &buf_idx); + +/* based on everyone's my_req, calculate what requests of other + processes lie in this process's file domain. + count_others_req_procs = number of processes whose requests lie in + this process's file domain (including this process itself) + count_others_req_per_proc[i] indicates how many separate contiguous + requests of proc. i lie in this process's file domain. */ + + PNCIO_Calc_others_req(fd, count_my_req_procs, count_my_req_per_proc, + my_req, nprocs, myrank, &count_others_req_procs, + &count_others_req_per_proc, &others_req); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[1] += MPI_Wtime() - curT; +#endif + +/* exchange data and write in sizes of no more than coll_bufsize. */ + /* Cast away const'ness for the below function */ + w_len = Exch_and_write(fd, (char *) buf, buf_view, nprocs, myrank, + others_req, min_st_offset, fd_size, fd_start, + fd_end, buf_idx); + + /* If this collective write is followed by an independent write, + * it's possible to have those subsequent writes on other processes + * race ahead and sneak in before the read-modify-write completes. + * We carry out a collective communication at the end here so no one + * can start independent i/o before collective I/O completes. + * + * need to do some gymnastics with the error codes so that if something + * went wrong, all processes report error, but if a process has a more + * specific error code, we can still have that process report the + * additional information */ + + /* optimization: if only one process performing i/o, we can perform + * a less-expensive Bcast + */ + if (fd->hints->cb_nodes == 1) + MPI_Bcast(&w_len, 1, MPI_OFFSET, fd->hints->ranklist[0], fd->comm); + else + MPI_Allreduce(MPI_IN_PLACE, &w_len, 1, MPI_OFFSET, MPI_MIN, fd->comm); + + /* free all memory allocated for collective I/O */ + PNCIO_Free_my_req(count_my_req_per_proc, my_req, buf_idx); + PNCIO_Free_others_req(count_others_req_per_proc, others_req); + + NCI_Free(st_offsets); + NCI_Free(fd_start); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[0] += MPI_Wtime() - curT; +#endif + + /* w_len may not be the same as buf_view.size, because data sieving may + * write more than requested. + */ + return buf_view.size; +} + +/* If successful, it returns the amount written. Otherwise a NetCDF error code + * (negative value) is returned. + */ +static +MPI_Offset Exch_and_write(PNCIO_File *fd, void *buf, PNCIO_View buf_view, + int nprocs, + int myrank, + PNCIO_Access *others_req, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Aint * buf_idx) +{ +/* Send data to appropriate processes and write in sizes of no more + than coll_bufsize. + The idea is to reduce the amount of extra memory required for + collective I/O. If all data were written all at once, which is much + easier, it would require temp space more than the size of user_buf, + which is often unacceptable. 
For example, to write a distributed + array to a file, where each local array is 8Mbytes, requiring + at least another 8Mbytes of temp space is unacceptable. */ + + /* Not convinced end_loc-st_loc couldn't be > int, so make these offsets */ + MPI_Offset size=0, w_len, total_w_len=0; + int hole, i, m, ntimes, max_ntimes; + MPI_Offset st_loc = -1, end_loc = -1, off, done, req_off; + char *write_buf = NULL; + MPI_Count *curr_offlen_ptr, *send_size, *count, req_len, *recv_size; + MPI_Count *partial_recv, *sent_to_proc, *start_pos; + int flag; + MPI_Count *send_buf_idx, *curr_to_proc, *done_to_proc; + int info_flag; + MPI_Aint coll_bufsize; + char *value; + + /* only I/O errors are currently reported */ + +/* calculate the number of writes of size coll_bufsize + to be done by each process and the max among all processes. + That gives the no. of communication phases as well. */ + + value = (char *) NCI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + MPI_Info_get(fd->info, "cb_buffer_size", MPI_MAX_INFO_VAL, value, &info_flag); + coll_bufsize = atoi(value); + NCI_Free(value); + + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + st_loc = others_req[i].offsets[0]; + end_loc = others_req[i].offsets[0]; + break; + } + } + + for (i = 0; i < nprocs; i++) + for (MPI_Count j = 0; j < others_req[i].count; j++) { + st_loc = MIN(st_loc, others_req[i].offsets[j]); + end_loc = MAX(end_loc, (others_req[i].offsets[j] + + others_req[i].lens[j] - 1)); + } + +/* ntimes=ceiling_div(end_loc - st_loc + 1, coll_bufsize)*/ + + ntimes = (int) ((end_loc - st_loc + coll_bufsize) / coll_bufsize); + + if ((st_loc == -1) && (end_loc == -1)) { + ntimes = 0; /* this process does no writing. */ + } + + MPI_Allreduce(&ntimes, &max_ntimes, 1, MPI_INT, MPI_MAX, fd->comm); + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + fd->write_counter[0] = MAX(fd->write_counter[0], max_ntimes); +#endif + + write_buf = fd->io_buf; + + curr_offlen_ptr = NCI_Calloc(nprocs * 10, sizeof(*curr_offlen_ptr)); + /* its use is explained below. calloc initializes to 0. */ + + count = curr_offlen_ptr + nprocs; + /* to store count of how many off-len pairs per proc are satisfied + * in an iteration. */ + + partial_recv = count + nprocs; + /* if only a portion of the last off-len pair is recd. from a process + * in a particular iteration, the length recd. is stored here. + * calloc initializes to 0. */ + + send_size = partial_recv + nprocs; + /* total size of data to be sent to each proc. in an iteration. + * Of size nprocs so that I can use MPI_Alltoall later. */ + + recv_size = send_size + nprocs; + /* total size of data to be recd. from each proc. in an iteration. */ + + sent_to_proc = recv_size + nprocs; + /* amount of data sent to each proc so far. Used in + * Fill_send_buffer. initialized to 0 here. 
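+     * (All ten per-process arrays above and below share the single
+     * calloc'd block of 10 * nprocs counters, carved up by pointer
+     * arithmetic, so the one NCI_Free(curr_offlen_ptr) at the end
+     * releases them all.)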
*/ + + send_buf_idx = sent_to_proc + nprocs; + curr_to_proc = send_buf_idx + nprocs; + done_to_proc = curr_to_proc + nprocs; + /* Above three are used in Fill_send_buffer */ + + start_pos = done_to_proc + nprocs; + /* used to store the starting value of curr_offlen_ptr[i] in + * this iteration */ + + done = 0; + off = st_loc; +// printf("%s at %d: off=%lld buf_view.size=%lld ntimes=%d\n",__func__,__LINE__, off,buf_view.size,ntimes); + + for (m = 0; m < ntimes; m++) { + /* go through all others_req and check which will be satisfied + * by the current write */ + + /* Note that MPI guarantees that displacements in filetypes are in + * monotonically nondecreasing order and that, for writes, the + * filetypes cannot specify overlapping regions in the file. This + * simplifies implementation a bit compared to reads. */ + + /* off = start offset in the file for the data to be written in + * this iteration + * size = size of data written (bytes) corresponding to off + * req_off = off in file for a particular contiguous request + * minus what was satisfied in previous iteration + * req_size = size corresponding to req_off */ + + /* first calculate what should be communicated */ + + for (i = 0; i < nprocs; i++) + count[i] = recv_size[i] = 0; + + size = MIN(coll_bufsize, end_loc - st_loc + 1 - done); + + for (i = 0; i < nprocs; i++) { + if (others_req[i].count) { + start_pos[i] = curr_offlen_ptr[i]; + MPI_Count j; + for (j = curr_offlen_ptr[i]; j < others_req[i].count; j++) { + if (partial_recv[i]) { + /* this request may have been partially + * satisfied in the previous iteration. */ + req_off = others_req[i].offsets[j] + partial_recv[i]; + req_len = others_req[i].lens[j] - partial_recv[i]; + partial_recv[i] = 0; + /* modify the off-len pair to reflect this change */ + others_req[i].offsets[j] = req_off; + others_req[i].lens[j] = req_len; + } else { + req_off = others_req[i].offsets[j]; + req_len = others_req[i].lens[j]; + } + if (req_off < off + size) { + count[i]++; + if (myrank != i) { + MPI_Aint addr; + MPI_Get_address(write_buf + req_off - off, &addr); + others_req[i].mem_ptrs[j] = addr; + } + else + others_req[i].mem_ptrs[j] = req_off - off; + recv_size[i] += MIN(off + size - req_off, req_len); + + if (off + size - req_off < req_len) { + partial_recv[i] = (off + size - req_off); + + /* --BEGIN ERROR HANDLING-- */ + if ((j + 1 < others_req[i].count) && + (others_req[i].offsets[j + 1] < off + size)) { + /* This error should not happen to PnetCDF, as + * fileview is checked before entering this + * subroutine. 
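+ * Overlapping regions would invalidate the hole detection and the
+ * read-modify-write carried out below.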
+ */
+ fprintf(stderr, "Filetype specifies overlapping write regions (which is illegal according to the MPI-2 specification)\n");
+ /* ROMIO allows this case to continue since additional
+ * communication might have to occur, but PnetCDF returns
+ * an error instead
+ */
+ return NC_EFILE;
+ /* --END ERROR HANDLING-- */
+ break;
+ }
+ } else
+ break;
+ }
+ curr_offlen_ptr[i] = j;
+ }
+ }
+
+ w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size,
+ recv_size, off, size, count, start_pos,
+ partial_recv, sent_to_proc, nprocs, myrank,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buf_idx);
+
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+
+ flag = 0;
+ for (i = 0; i < nprocs; i++)
+ if (count[i])
+ flag = 1;
+
+ if (flag) {
+ w_len = PNCIO_WriteContig(fd, write_buf, size, off);
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+ }
+
+ off += size;
+ done += size;
+ }
+
+ for (i = 0; i < nprocs; i++)
+ count[i] = recv_size[i] = 0;
+ for (m = ntimes; m < max_ntimes; m++) {
+ /* nothing to recv, but check for send. */
+ w_len = W_Exchange_data(fd, buf, write_buf, buf_view, send_size,
+ recv_size, off, size, count, start_pos,
+ partial_recv, sent_to_proc, nprocs, myrank,
+ min_st_offset, fd_size, fd_start, fd_end,
+ others_req, send_buf_idx, curr_to_proc,
+ done_to_proc, &hole, m, buf_idx);
+ if (w_len < 0)
+ return w_len;
+ else
+ total_w_len += w_len;
+ }
+
+ NCI_Free(curr_offlen_ptr);
+
+ return total_w_len;
+}
+
+
+/* Returns NC_NOERR if successful, or a NetCDF error code (negative value)
+ * in the case of error.
+ */
+static
+MPI_Offset W_Exchange_data(PNCIO_File *fd, void *buf, char *write_buf,
+ PNCIO_View buf_view,
+ MPI_Count *send_size, MPI_Count *recv_size,
+ MPI_Offset off, MPI_Count size,
+ MPI_Count *count, MPI_Count * start_pos,
+ MPI_Count *partial_recv,
+ MPI_Count *sent_to_proc, int nprocs,
+ int myrank,
+ MPI_Offset min_st_offset,
+ MPI_Offset fd_size,
+ MPI_Offset * fd_start, MPI_Offset * fd_end,
+ PNCIO_Access * others_req,
+ MPI_Count * send_buf_idx, MPI_Count * curr_to_proc,
+ MPI_Count * done_to_proc, int *hole, int iter,
+ MPI_Aint *buf_idx)
+{
+ int i, j, nprocs_recv, nprocs_send, err=NC_NOERR;
+ MPI_Count *tmp_len;
+ char **send_buf = NULL;
+ MPI_Request *requests, *send_req;
+ MPI_Datatype *recv_types, self_recv_type = MPI_DATATYPE_NULL;
+ MPI_Status *statuses, status;
+ MPI_Count sum, *srt_len = NULL;
+ int num_rtypes, nreqs;
+ MPI_Offset *srt_off = NULL;
+
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+double curT = MPI_Wtime();
+#endif
+
+/* exchange recv_size info so that each process knows how much to
+ send to whom. */
+
+ MPI_Alltoall(recv_size, 1, MPI_COUNT, send_size, 1, MPI_COUNT, fd->comm);
+
+ /* create derived datatypes for recv */
+
+ nprocs_send = 0;
+ nprocs_recv = 0;
+ sum = 0;
+ for (i = 0; i < nprocs; i++) {
+ sum += count[i];
+ if (recv_size[i])
+ nprocs_recv++;
+ if (send_size[i])
+ nprocs_send++;
+ }
+
+ recv_types = (MPI_Datatype *) NCI_Malloc((nprocs_recv + 1) * sizeof(MPI_Datatype));
+ /* +1 to avoid a 0-size malloc */
+
+ tmp_len = NCI_Malloc(nprocs * sizeof(*tmp_len));
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i]) {
+ MPI_Datatype *dtype;
+ dtype = (i != myrank) ?
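/* non-self receives take an entry in recv_types[]; data from self is described by self_recv_type and unpacked locally */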
(recv_types + j) : (&self_recv_type);
+
+ if (partial_recv[i]) {
+ /* take care if the last off-len pair is a partial recv */
+ MPI_Count k = start_pos[i] + count[i] - 1;
+ tmp_len[i] = others_req[i].lens[k];
+ others_req[i].lens[k] = partial_recv[i];
+ }
+#ifdef HAVE_MPI_LARGE_COUNT
+ MPI_Type_create_hindexed_c(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, dtype);
+#else
+ MPI_Type_create_hindexed(count[i],
+ &(others_req[i].lens[start_pos[i]]),
+ &(others_req[i].mem_ptrs[start_pos[i]]),
+ MPI_BYTE, dtype);
+#endif
+ /* absolute displacements; use MPI_BOTTOM in recv */
+ MPI_Type_commit(dtype);
+ if (i != myrank)
+ j++;
+ }
+ }
+ num_rtypes = j; /* number of non-self receive datatypes created */
+
+ /* To avoid a read-modify-write, check if there are holes in the
+ * data to be written. For this, merge the (sorted) offset lists
+ * others_req using a heap-merge. */
+
+/* TODO: PNCIO_Heap_merge is expensive, borrow code from ad_lustre_wrcoll.c to skip it when possible */
+
+ /* valgrind-detected optimization: if there is no work on this process we do
+ * not need to search for holes */
+ if (sum) {
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+ double timing = MPI_Wtime();
+#endif
+ srt_off = (MPI_Offset *) NCI_Malloc(sum * sizeof(MPI_Offset));
+ srt_len = NCI_Malloc(sum * sizeof(*srt_len));
+
+ PNCIO_Heap_merge(others_req, count, srt_off, srt_len, start_pos,
+ nprocs, nprocs_recv, sum);
+#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1)
+ if (fd->is_agg) fd->write_timing[5] += MPI_Wtime() - timing;
+#endif
+ }
+
+ /* for partial recvs, restore original lengths */
+ for (i = 0; i < nprocs; i++)
+ if (partial_recv[i]) {
+ MPI_Count k = start_pos[i] + count[i] - 1;
+ others_req[i].lens[k] = tmp_len[i];
+ }
+ NCI_Free(tmp_len);
+
+ /* check if there are any holes. If yes, must do read-modify-write.
+ * holes can be in three places. 'middle' is what you'd expect: the
+ * processes are operating on noncontiguous data. But holes can also show
+ * up at the beginning or end of the file domain (see John Bent ROMIO REQ
+ * #835). Missing these holes would result in us writing more data than
+ * received by everyone else. */
+
+ *hole = 0;
+ if (sum) {
+ if (off != srt_off[0]) /* hole at the front */
+ *hole = 1;
+ else { /* coalesce the sorted offset-length pairs */
+ for (i = 1; i < sum; i++) {
+ if (srt_off[i] <= srt_off[0] + srt_len[0]) {
+ MPI_Count new_len = srt_off[i] + srt_len[i] - srt_off[0];
+ if (new_len > srt_len[0])
+ srt_len[0] = new_len;
+ } else
+ break;
+ }
+ if (i < sum || size != srt_len[0]) /* hole in middle or end */
+ *hole = 1;
+ }
+
+ NCI_Free(srt_off);
+ NCI_Free(srt_len);
+ }
+
+ if (nprocs_recv) {
+ if (*hole) {
+ MPI_Offset r_len;
+ r_len = PNCIO_ReadContig(fd, write_buf, size, off);
+ if (r_len < 0) return r_len;
+ }
+ }
+
+ if (fd->atomicity) {
+ /* nreqs counts only the Isends to be posted; in atomic mode the
+ * receives below are blocking MPI_Recv calls */
+ nreqs = (send_size[myrank]) ?
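/* exclude the self-send: data to self is unpacked locally, not sent */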
(nprocs_send - 1) : nprocs_send;
+ requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+ send_req = requests;
+ } else {
+ nreqs = nprocs_send + nprocs_recv;
+ if (send_size[myrank]) /* NO send to and recv from self */
+ nreqs -= 2;
+ requests = (MPI_Request *) NCI_Malloc((nreqs + 1) * sizeof(MPI_Request));
+ /* +1 to avoid a 0-size malloc */
+
+ /* post receives */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i] == 0)
+ continue;
+ if (i != myrank) {
+ MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, 0,
+ fd->comm, requests + j);
+ j++;
+ } else if (buf_view.is_contig) {
+ /* send/recv to/from self uses MPI_Unpack() */
+#ifdef HAVE_MPI_LARGE_COUNT
+ MPI_Count position=0;
+ MPI_Unpack_c((char *) buf + buf_idx[i], recv_size[i], &position,
+ write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#else
+ int position = 0;
+ assert(recv_size[i] < INT_MAX);
+ MPI_Unpack((char *) buf + buf_idx[i], (int)recv_size[i], &position,
+ write_buf, 1, self_recv_type, MPI_COMM_SELF);
+#endif
+ buf_idx[i] += recv_size[i];
+ }
+ }
+ send_req = requests + j;
+ }
+
+/* post sends. if buf_view.is_contig, data can be directly sent from
+ user buf at location given by buf_idx. else use send_buf. */
+
+ if (buf_view.is_contig) {
+ j = 0;
+ for (i = 0; i < nprocs; i++)
+ if (send_size[i] && i != myrank) {
+ assert(buf_idx[i] != -1);
+#if MPI_VERSION >= 4
+ MPI_Isend_c((char *) buf + buf_idx[i], send_size[i],
+ MPI_BYTE, i, 0, fd->comm, send_req + j);
+#else
+ MPI_Isend((char *) buf + buf_idx[i], send_size[i],
+ MPI_BYTE, i, 0, fd->comm, send_req + j);
+#endif
+ j++;
+ buf_idx[i] += send_size[i];
+ }
+ } else if (nprocs_send) {
+ /* buftype is not contig */
+ size_t msgLen = 0;
+ for (i = 0; i < nprocs; i++)
+ msgLen += send_size[i];
+ send_buf = (char **) NCI_Malloc(nprocs * sizeof(char *));
+ send_buf[0] = (char *) NCI_Malloc(msgLen * sizeof(char));
+ for (i = 1; i < nprocs; i++)
+ send_buf[i] = send_buf[i - 1] + send_size[i - 1];
+
+ Fill_send_buffer(fd, buf, buf_view, send_buf, send_size, send_req,
+ sent_to_proc, nprocs, myrank, min_st_offset, fd_size,
+ fd_start, fd_end, send_buf_idx, curr_to_proc,
+ done_to_proc, iter);
+
+ /* the send is done in Fill_send_buffer */
+ }
+
+ if (fd->atomicity) {
+ /* In atomic mode, we must use blocking receives to receive data in the
+ * same increasing order of MPI process rank IDs.
+ */
+ j = 0;
+ for (i = 0; i < nprocs; i++) {
+ if (recv_size[i] == 0)
+ continue;
+ if (i != myrank) {
+ MPI_Recv(MPI_BOTTOM, 1, recv_types[j++], i, 0,
+ fd->comm, &status);
+ } else {
+ /* send/recv to/from self uses MPI_Unpack() */
+ char *ptr = (buf_view.is_contig) ?
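/* self data comes straight from the user buffer when it is contiguous, otherwise from the packed send_buf */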
(char *) buf + buf_idx[i] : send_buf[i]; +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count position=0; + MPI_Unpack_c(ptr, recv_size[i], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#else + int position = 0; + assert(recv_size[i] < INT_MAX); + MPI_Unpack(ptr, (int)recv_size[i], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#endif + buf_idx[i] += recv_size[i]; + } + } + } else if (!buf_view.is_contig && recv_size[myrank]) { +#ifdef HAVE_MPI_LARGE_COUNT + MPI_Count position=0; + MPI_Unpack_c(send_buf[myrank], recv_size[myrank], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#else + int position = 0; + assert(recv_size[myrank] < INT_MAX); + MPI_Unpack(send_buf[myrank], (int)recv_size[myrank], &position, write_buf, 1, self_recv_type, + MPI_COMM_SELF); +#endif + } + + for (i = 0; i < num_rtypes; i++) + MPI_Type_free(recv_types + i); + NCI_Free(recv_types); + + if (self_recv_type != MPI_DATATYPE_NULL) + MPI_Type_free(&self_recv_type); + +#ifdef HAVE_MPI_STATUSES_IGNORE + statuses = MPI_STATUSES_IGNORE; +#else + statuses = (MPI_Status *) NCI_Malloc(nreqs * sizeof(MPI_Status)); +#endif + +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[4] += MPI_Wtime() - curT; + curT = MPI_Wtime(); +#endif + MPI_Waitall(nreqs, requests, statuses); +#if defined(PNETCDF_PROFILING) && (PNETCDF_PROFILING == 1) + if (fd->is_agg) fd->write_timing[3] += MPI_Wtime() - curT; +#endif + +#ifndef HAVE_MPI_STATUSES_IGNORE + NCI_Free(statuses); +#endif + NCI_Free(requests); + if (!buf_view.is_contig && nprocs_send) { + NCI_Free(send_buf[0]); + NCI_Free(send_buf); + } + + return err; +} + +#define BUF_INCR \ +{ \ + while (buf_incr) { \ + size_in_buf = MIN(buf_incr, flat_buf_sz); \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (buf_incr > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ +} + +#define BUF_COPY \ +{ \ + while (size) { \ + size_in_buf = MIN(size, flat_buf_sz); \ + memcpy(&(send_buf[p][send_buf_idx[p]]), \ + ((char *) buf) + user_buf_idx, size_in_buf); \ + send_buf_idx[p] += size_in_buf; \ + user_buf_idx += size_in_buf; \ + flat_buf_sz -= size_in_buf; \ + size -= size_in_buf; \ + buf_incr -= size_in_buf; \ + if (size > 0 && flat_buf_sz == 0) { \ + flat_buf_idx++; \ + user_buf_idx = buf_view.off[flat_buf_idx]; \ + flat_buf_sz = buf_view.len[flat_buf_idx]; \ + } \ + } \ + BUF_INCR \ +} + +static +void Fill_send_buffer(PNCIO_File *fd, void *buf, + PNCIO_View buf_view, char **send_buf, + MPI_Count * send_size, + MPI_Request * requests, MPI_Count * sent_to_proc, + int nprocs, int myrank, + MPI_Offset min_st_offset, MPI_Offset fd_size, + MPI_Offset * fd_start, MPI_Offset * fd_end, + MPI_Count * send_buf_idx, MPI_Count * curr_to_proc, + MPI_Count * done_to_proc, int iter) +{ +/* this function is only called if buftype is not contig */ + + int p, jj; + MPI_Offset flat_buf_idx, flat_buf_sz, size_in_buf, buf_incr, size; + MPI_Offset off, len, rem_len, user_buf_idx; + +/* curr_to_proc[p] = amount of data sent to proc. p that has already + been accounted for so far + done_to_proc[p] = amount of data already sent to proc. p in + previous iterations + user_buf_idx = current location in user buffer + send_buf_idx[p] = current location in send_buf of proc. 
p */ + + for (MPI_Count i = 0; i < nprocs; i++) { + send_buf_idx[i] = curr_to_proc[i] = 0; + done_to_proc[i] = sent_to_proc[i]; + } + jj = 0; + + user_buf_idx = buf_view.off[0]; + flat_buf_idx = 0; + flat_buf_sz = buf_view.len[0]; + + /* flat_buf_idx = current index into flattened buftype + * flat_buf_sz = size of current contiguous component in + * flattened buf */ + + for (MPI_Count i = 0; i < fd->flat_file.count; i++) { + off = fd->flat_file.off[i]; + rem_len = fd->flat_file.len[i]; + + /*this request may span the file domains of more than one process */ + while (rem_len != 0) { + len = rem_len; + /* NOTE: len value is modified by PNCIO_Calc_aggregator() to be no + * longer than the single region that processor "p" is responsible + * for. + */ + p = PNCIO_Calc_aggregator(fd, off, min_st_offset, &len, fd_size, fd_end); + + if (send_buf_idx[p] < send_size[p]) { + if (curr_to_proc[p] + len > done_to_proc[p]) { + if (done_to_proc[p] > curr_to_proc[p]) { + size = MIN(curr_to_proc[p] + len - + done_to_proc[p], send_size[p] - send_buf_idx[p]); + buf_incr = done_to_proc[p] - curr_to_proc[p]; + BUF_INCR + buf_incr = curr_to_proc[p] + len - done_to_proc[p]; + /* ok to cast: bounded by cb buffer size */ + curr_to_proc[p] = done_to_proc[p] + size; + BUF_COPY + } else { + size = MIN(len, send_size[p] - send_buf_idx[p]); + buf_incr = len; + curr_to_proc[p] += size; + BUF_COPY + } + if (send_buf_idx[p] == send_size[p] && p != myrank) { +#if MPI_VERSION >= 4 + MPI_Isend_c(send_buf[p], send_size[p], MPI_BYTE, p, + 0, fd->comm, &requests[jj++]); +#else + MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, + 0, fd->comm, &requests[jj++]); +#endif + } + } else { + curr_to_proc[p] += len; + buf_incr = len; + BUF_INCR + } + } else { + buf_incr = len; + BUF_INCR + } + off += len; + rem_len -= len; + } + } + for (int i = 0; i < nprocs; i++) { + if (send_size[i]) { + sent_to_proc[i] = curr_to_proc[i]; + } + } +} diff --git a/src/drivers/pncio/pncio_write_str.c b/src/drivers/pncio/pncio_write_str.c new file mode 100644 index 000000000..cb4ac25e8 --- /dev/null +++ b/src/drivers/pncio/pncio_write_str.c @@ -0,0 +1,328 @@ +/* + * Copyright (C) 2025, Northwestern University and Argonne National Laboratory + * See COPYRIGHT notice in top-level directory. 
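+ *
+ * This file implements PNCIO_GEN_WriteStrided(), which carries out a
+ * noncontiguous write request by staging data through an intermediate
+ * buffer (data sieving), so that each file access becomes one large
+ * contiguous read or write.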
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include
+
+#define BUFFERED_WRITE { \
+ if (req_off >= writebuf_off + writebuf_len) { \
+ if (writebuf_len) { \
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, \
+ writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (w_len < 0) goto fn_exit; \
+ total_w_len += w_len; \
+ } \
+ writebuf_off = req_off; \
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (r_len < 0) goto fn_exit; \
+ } \
+ write_sz = (MPI_Aint)MIN(req_len, writebuf_off+writebuf_len-req_off); \
+ memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off, write_sz); \
+ while (write_sz != req_len) { \
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ if (w_len < 0) goto fn_exit; \
+ total_w_len += w_len; \
+ req_len -= write_sz; \
+ userbuf_off += write_sz; \
+ writebuf_off += writebuf_len; \
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1); \
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE) \
+ PNCIO_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \
+ r_len = PNCIO_ReadContig(fd, writebuf, writebuf_len, writebuf_off); \
+ if (r_len < 0) goto fn_exit; \
+ write_sz = MIN(req_len, writebuf_len); \
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz); \
+ } \
+}
+
+
+MPI_Offset PNCIO_GEN_WriteStrided(PNCIO_File *fd,
+ const void *buf,
+ PNCIO_View buf_view,
+ MPI_Offset offset)
+{
+
+/* offset is in units of etype relative to the filetype. */
+
+ char *writebuf = NULL;
+ int i, j, k, st_index = 0;
+ MPI_Aint writebuf_len, max_bufsize, write_sz, bufsize;
+ MPI_Offset i_offset, sum, num, size, abs_off_in_filetype=0;
+ MPI_Offset userbuf_off, off, req_off, disp, end_offset=0;
+ MPI_Offset writebuf_off, start_off, new_bwr_size, new_fwr_size;
+ MPI_Offset st_fwr_size, fwr_size = 0, bwr_size, req_len;
+ MPI_Offset r_len, w_len, total_w_len=0;
+
+ /* Contiguous both in buftype and filetype should have been handled in a
+ * call to PNCIO_WriteContig() earlier.
+ */
+ assert(!(buf_view.is_contig && fd->flat_file.is_contig));
+
+ if (fd->hints->ds_write == PNCIO_HINT_DISABLE) {
+ /* If user has disabled data sieving on writes, use naive approach
+ * instead.
+ */
+ return PNCIO_GEN_WriteStrided_naive(fd, buf, buf_view, offset);
+ }
+
+// printf("%s at %d: offset=%lld\n",__func__,__LINE__, offset);
+
+/* PnetCDF always sets these 3 conditions */
+assert(fd->filetype == MPI_BYTE);
+assert(fd->flat_file.size == buf_view.size);
+if (fd->flat_file.count > 0) assert(offset == 0); /* not whole file visible */
+
+ bufsize = buf_view.size;
+
+ /* get max_bufsize from the info object. */
+ max_bufsize = fd->hints->ind_wr_buffer_size;
+
+ if (!buf_view.is_contig && fd->flat_file.is_contig) {
+ /* noncontiguous in memory, contiguous in file.
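+ * Scattered pieces of the user buffer are copied into writebuf and
+ * flushed with one contiguous write each time the buffer fills.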
*/
+
+ off = fd->disp + offset;
+assert(fd->disp == 0);
+ if (fd->flat_file.count > 0) off += fd->flat_file.off[0];
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+ writebuf_off = off;
+ writebuf = (char *) NCI_Malloc(max_bufsize);
+ writebuf_len = MIN(max_bufsize, end_offset - writebuf_off + 1);
+
+ /* if atomicity is true or data sieving is not disabled, lock the region
+ * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ for (i = 0; i < buf_view.count; i++) {
+ userbuf_off = buf_view.off[i];
+ req_off = off;
+ req_len = buf_view.len[i];
+
+ /* BUFFERED_WRITE_WITHOUT_READ performs neither read-modify-write
+ * nor file locking
+ */
+ if (req_off >= writebuf_off + writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,
+ writebuf_off);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ writebuf_off = req_off;
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1);
+ }
+ write_sz = MIN(req_len, writebuf_off + writebuf_len - req_off);
+ memcpy(writebuf+req_off-writebuf_off, (char*)buf +userbuf_off,
+ write_sz);
+ while (write_sz != req_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len,
+ writebuf_off);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ req_len -= write_sz;
+ userbuf_off += write_sz;
+ writebuf_off += writebuf_len;
+ writebuf_len = MIN(max_bufsize,end_offset-writebuf_off+1);
+ write_sz = MIN(req_len, writebuf_len);
+ memcpy(writebuf, (char *)buf + userbuf_off, write_sz);
+ }
+
+ off += buf_view.len[i];
+ }
+
+ /* write the buffer out finally */
+ if (writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+ if (w_len >= 0) total_w_len += w_len;
+ }
+ else
+ w_len = 0;
+
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ if (w_len < 0)
+ goto fn_exit;
+ }
+ else { /* noncontiguous in file */
+ MPI_Offset size_in_filetype = offset;
+
+ disp = fd->disp;
+assert(fd->disp == 0);
+
+ sum = 0;
+ for (i = 0; i < fd->flat_file.count; i++) {
+ sum += fd->flat_file.len[i];
+ if (sum > size_in_filetype) {
+ st_index = i;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = fd->flat_file.off[i] +
+ size_in_filetype - (sum - fd->flat_file.len[i]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ offset = disp + abs_off_in_filetype;
+
+ start_off = offset;
+assert(offset == abs_off_in_filetype);
+
+// printf("%s at %d: start_off=%lld abs_off_in_filetype=%lld\n",__func__,__LINE__,start_off,abs_off_in_filetype);
+
+ /* Write request is within single flat_file contig block. This could
+ * happen, for example, with subarray types that are actually fairly
+ * contiguous.
+ */
+ if (buf_view.is_contig && bufsize <= fwr_size) {
+ /* though MPI api has an integer 'count' parameter, derived
+ * datatypes might describe more bytes than can fit into an integer.
+ * if we've made it this far, we can pass a count of original
+ * datatypes, instead of a count of bytes (which might overflow)
+ * Other WriteContig calls in this path are operating on data
+ * sieving buffer */
+ PNCIO_WRITE_LOCK(fd, offset, SEEK_SET, bufsize);
+ w_len = PNCIO_WriteContig(fd, buf, buf_view.size, offset);
+ if (w_len > 0) total_w_len += w_len;
+ PNCIO_UNLOCK(fd, offset, SEEK_SET, bufsize);
+
+ goto fn_exit;
+ }
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
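+ * This is found by walking the flattened file view until bufsize
+ * bytes have been accounted for.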
+ * e.g., if start_offset=0 and 100 bytes to be written, end_offset=99 */
+
+ st_fwr_size = fwr_size;
+ j = st_index;
+ fwr_size = MIN(fwr_size, bufsize);
+ i_offset = fwr_size;
+ end_offset = offset + fwr_size - 1;
+ while (i_offset < bufsize) {
+ j++;
+ fwr_size = MIN(fd->flat_file.len[j], bufsize - i_offset);
+ i_offset += fwr_size;
+ end_offset = disp + fd->flat_file.off[j] + fwr_size - 1;
+ }
+
+ /* if atomicity is true or data sieving is not disabled, lock the region
+ * to be accessed */
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ writebuf_off = 0;
+ writebuf_len = 0;
+ writebuf = (char *) NCI_Malloc(max_bufsize);
+ memset(writebuf, -1, max_bufsize);
+
+ if (buf_view.is_contig && !fd->flat_file.is_contig) {
+ /* contiguous in memory, noncontiguous in file should be the most
+ * common case.
+ */
+ i_offset = 0;
+ j = st_index;
+ off = offset;
+ fwr_size = MIN(st_fwr_size, bufsize);
+ while (i_offset < bufsize) {
+ if (fwr_size) {
+ req_off = off;
+ req_len = fwr_size;
+ userbuf_off = i_offset;
+ BUFFERED_WRITE;
+ }
+
+ i_offset += fwr_size;
+ if (i_offset >= bufsize) break;
+
+ if (off + fwr_size < disp + fd->flat_file.off[j] +
+ fd->flat_file.len[j])
+ off += fwr_size; /* off is incremented by fwr_size. */
+ else {
+ j++;
+assert(j < fd->flat_file.count);
+ off = disp + fd->flat_file.off[j];
+ fwr_size = MIN(fd->flat_file.len[j],
+ bufsize - i_offset);
+ }
+ }
+ } else {
+ /* noncontiguous in memory as well as in file */
+ k = num = 0;
+ i_offset = buf_view.off[0];
+ j = st_index;
+ off = offset;
+ fwr_size = st_fwr_size;
+ bwr_size = buf_view.len[0];
+
+ while (num < bufsize) {
+ size = MIN(fwr_size, bwr_size);
+ if (size) {
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+ BUFFERED_WRITE;
+ }
+
+ num += size;
+ if (num >= bufsize) break;
+
+ new_fwr_size = fwr_size;
+ new_bwr_size = bwr_size;
+
+ if (size == fwr_size) {
+ j++;
+assert(j < fd->flat_file.count);
+ off = disp + fd->flat_file.off[j];
+ new_fwr_size = fd->flat_file.len[j];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+ /* reached end of contiguous block in memory */
+
+ k++;
+assert(k < buf_view.count);
+ i_offset = buf_view.off[k];
+ new_bwr_size = buf_view.len[k];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* write the buffer out finally */
+ if (writebuf_len) {
+ w_len = PNCIO_WriteContig(fd, writebuf, writebuf_len, writebuf_off);
+ if (!fd->atomicity && fd->hints->ds_write == PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len);
+ if (w_len < 0) goto fn_exit;
+ total_w_len += w_len;
+ }
+ if (fd->atomicity || fd->hints->ds_write != PNCIO_HINT_DISABLE)
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+ }
+
+fn_exit:
+ if (writebuf != NULL)
+ NCI_Free(writebuf);
+
+ return total_w_len;
+}
diff --git a/src/drivers/pncio/pncio_write_str_naive.c b/src/drivers/pncio/pncio_write_str_naive.c
new file mode 100644
index 000000000..572ed855b
--- /dev/null
+++ b/src/drivers/pncio/pncio_write_str_naive.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
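+ *
+ * This file implements PNCIO_GEN_WriteStrided_naive(), the fallback
+ * used when data sieving is disabled: each contiguous piece of a
+ * strided request is written with its own call to PNCIO_WriteContig().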
+ */
+
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+
+#include
+
+MPI_Offset PNCIO_GEN_WriteStrided_naive(PNCIO_File *fd,
+ const void *buf,
+ PNCIO_View buf_view,
+ MPI_Offset offset)
+{
+ int b_index;
+ MPI_Count bufsize;
+
+ /* bwr == buffer write; fwr == file write */
+ MPI_Offset bwr_size, fwr_size = 0, sum, size_in_filetype, size;
+ MPI_Offset abs_off_in_filetype = 0, req_len, userbuf_off;
+ MPI_Offset off, req_off, disp, end_offset = 0, start_off;
+ MPI_Offset w_len, total_w_len=0;
+
+/* PnetCDF always sets fd->filetype == MPI_BYTE */
+assert(fd->filetype == MPI_BYTE);
+
+ /* Contiguous both in buftype and filetype should have been handled in a
+ * call to PNCIO_WriteContig() earlier.
+ */
+ assert(!(buf_view.is_contig && fd->flat_file.is_contig));
+
+ bufsize = buf_view.size;
+
+ if (!buf_view.is_contig && fd->flat_file.is_contig) {
+ /* noncontiguous in memory, contiguous in file. */
+
+ off = fd->disp + offset;
+
+ start_off = off;
+ end_offset = off + bufsize - 1;
+
+ /* if atomicity is true, lock (exclusive) the region to be accessed */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ /* for each region in the buffer, grab the data and put it in place */
+ for (b_index = 0; b_index < buf_view.count; b_index++) {
+ userbuf_off = buf_view.off[b_index];
+ req_off = off;
+ req_len = buf_view.len[b_index];
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+
+ /* off is (potentially) used to save the final offset later */
+ off += buf_view.len[b_index];
+ }
+
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+ }
+ else { /* noncontiguous in file */
+ int f_index, st_index = 0;
+ MPI_Offset st_fwr_size;
+
+ /* First we're going to calculate a set of values for use in all
+ * the noncontiguous in file cases:
+ * start_off - starting byte position of data in file
+ * end_offset - last byte offset to be accessed in the file
+ * st_index - index of block in first filetype that we will be
+ * starting in (?)
+ * st_fwr_size - size of the data in the first filetype block
+ * that we will write (accounts for being part-way
+ * into writing this block of the filetype)
+ */
+
+ disp = fd->disp;
+
+ size_in_filetype = offset;
+
+ sum = 0;
+ for (f_index = 0; f_index < fd->flat_file.count; f_index++) {
+ sum += fd->flat_file.len[f_index];
+ if (sum > size_in_filetype) {
+ st_index = f_index;
+ fwr_size = sum - size_in_filetype;
+ abs_off_in_filetype = fd->flat_file.off[f_index] +
+ size_in_filetype - (sum - fd->flat_file.len[f_index]);
+ break;
+ }
+ }
+
+ /* abs. offset in bytes in the file */
+ start_off = disp + abs_off_in_filetype;
+
+ st_fwr_size = fwr_size;
+
+ /* start_off, st_index, and st_fwr_size are
+ * all calculated at this point
+ */
+
+ /* Calculate end_offset, the last byte-offset that will be accessed.
+ * e.g., if start_off=0 and 100 bytes to be written, end_offset=99
+ */
+ f_index = st_index;
+ fwr_size = MIN(st_fwr_size, bufsize);
+ userbuf_off = fwr_size;
+ end_offset = start_off + fwr_size - 1;
+ while (userbuf_off < bufsize) {
+ f_index++;
+ fwr_size = MIN(fd->flat_file.len[f_index],
+ bufsize - userbuf_off);
+ userbuf_off += fwr_size;
+ end_offset = disp + fd->flat_file.off[f_index] + fwr_size - 1;
+ }
+
+ /* End of calculations.
At this point the following values have
+ * been calculated and are ready for use:
+ * - start_off
+ * - end_offset
+ * - st_index
+ * - st_fwr_size
+ */
+
+ /* if atomicity is true, lock (exclusive) the region to be accessed */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1);
+
+ if (buf_view.is_contig && !fd->flat_file.is_contig) {
+ /* contiguous in memory, noncontiguous in file. should be the
+ * most common case.
+ */
+
+ userbuf_off = 0;
+ f_index = st_index;
+ off = start_off;
+ fwr_size = MIN(st_fwr_size, bufsize);
+
+ /* while there is still space in the buffer, write more data */
+ while (userbuf_off < bufsize) {
+ if (fwr_size) {
+ /* TYPE_UB and TYPE_LB can result in
+ * fwr_size = 0. save system call in such cases */
+ req_off = off;
+ req_len = fwr_size;
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+ }
+ userbuf_off += fwr_size;
+ if (userbuf_off >= bufsize) break;
+
+ if (off + fwr_size < disp + fd->flat_file.off[f_index] +
+ fd->flat_file.len[f_index]) {
+ /* important that this value be correct, as it is
+ * used to set the offset in the fd near the end of
+ * this function.
+ */
+ off += fwr_size;
+ }
+ /* did not reach end of contiguous block in filetype.
+ * no more I/O needed. off is incremented by fwr_size.
+ */
+ else {
+ f_index++;
+assert(f_index < fd->flat_file.count);
+ off = disp + fd->flat_file.off[f_index];
+ fwr_size = MIN(fd->flat_file.len[f_index],
+ bufsize - userbuf_off);
+ }
+ }
+ } else {
+ MPI_Offset i_offset, tmp_bufsize = 0;
+ /* noncontiguous in memory as well as in file */
+
+ b_index = 0;
+ i_offset = buf_view.off[0];
+ f_index = st_index;
+ off = start_off;
+ fwr_size = st_fwr_size;
+ bwr_size = buf_view.len[0];
+
+ /* while we haven't written size * count bytes, keep going */
+ while (tmp_bufsize < bufsize) {
+ MPI_Offset new_bwr_size = bwr_size, new_fwr_size = fwr_size;
+
+ size = MIN(fwr_size, bwr_size);
+ /* keep max of a single write amount <= INT_MAX */
+ size = MIN(size, INT_MAX);
+
+ if (size) {
+ req_off = off;
+ req_len = size;
+ userbuf_off = i_offset;
+
+ w_len = PNCIO_WriteContig(fd, (char *) buf + userbuf_off,
+ req_len, req_off);
+ if (w_len < 0) return w_len;
+ total_w_len += w_len;
+ }
+
+ if (tmp_bufsize >= bufsize) break;
+ tmp_bufsize += size;
+
+ if (size == fwr_size) {
+ f_index++;
+assert(f_index < fd->flat_file.count);
+ off = disp + fd->flat_file.off[f_index];
+ new_fwr_size = fd->flat_file.len[f_index];
+ if (size != bwr_size) {
+ i_offset += size;
+ new_bwr_size -= size;
+ }
+ }
+
+ if (size == bwr_size) {
+ /* reached end of contiguous block in memory */
+ b_index++;
+assert(b_index < buf_view.count);
+ i_offset = buf_view.off[b_index];
+ new_bwr_size = buf_view.len[b_index];
+ if (size != fwr_size) {
+ off += size;
+ new_fwr_size -= size;
+ }
+ }
+ fwr_size = new_fwr_size;
+ bwr_size = new_bwr_size;
+ }
+ }
+
+ /* unlock the file region if we locked it */
+ if ((fd->atomicity) && PNCIO_Feature(fd, PNCIO_LOCKS))
+ PNCIO_UNLOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1);
+
+ } /* end of (else noncontiguous in file) */
+
+ return total_w_len;
+}
diff --git a/src/include/pnc_debug.h b/src/include/pnc_debug.h
index 976244896..fcee81633 100644
--- a/src/include/pnc_debug.h
+++ b/src/include/pnc_debug.h
@@ -55,6 +55,27 @@ } \ return err; \ }
+#define DEBUG_FOPEN_ERROR(err) { \
+ if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \
+
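/* free the communicator created for intra-node aggregation so a failed open does not leak it */ \
+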
char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__); \ + } \ + return err; \ +} +#define DEBUG_RETURN_ERROR_MSG(err, msg) { \ + char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ + if (_env_str != NULL && *_env_str != '0') { \ + int _rank; \ + MPI_Comm_rank(MPI_COMM_WORLD, &_rank); \ + fprintf(stderr, "Rank %d: %s error at line %d of %s in %s (%s)\n", \ + _rank,ncmpi_strerrno(err),__LINE__,__func__,__FILE__, msg); \ + } \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) { \ char *_env_str = getenv("PNETCDF_VERBOSE_DEBUG_MODE"); \ if (_env_str != NULL && *_env_str != '0') { \ @@ -76,6 +97,11 @@ } #else #define DEBUG_RETURN_ERROR(err) return err; +#define DEBUG_RETURN_ERROR_MSG(err, msg) return err; +#define DEBUG_FOPEN_ERROR(err) { \ + if (ncp->ina_comm != MPI_COMM_NULL) MPI_Comm_free(&ncp->ina_comm); \ + return err; \ +} #define DEBUG_ASSIGN_ERROR(status, err) status = err; #define DEBUG_TRACE_ERROR(err) #endif diff --git a/src/include/pnetcdf.h.in b/src/include/pnetcdf.h.in index b363276b6..df7e9f66a 100644 --- a/src/include/pnetcdf.h.in +++ b/src/include/pnetcdf.h.in @@ -16,6 +16,7 @@ #define PNETCDF_VERSION_MAJOR @PNETCDF_VERSION_MAJOR@ #define PNETCDF_VERSION_MINOR @PNETCDF_VERSION_MINOR@ #define PNETCDF_VERSION_SUB @PNETCDF_VERSION_SUB@ +#define PNETCDF_VERSION_PRE "@PNETCDF_VERSION_PRE@" #define PNETCDF_RELEASE_DATE "DIST_DATE" /* List of PnetCDF features enabled/disabled at configure time. @@ -657,6 +658,7 @@ by the desired type. */ #define NC_EBADLOG (-238) /**< Unrecognized log file format */ #define NC_EFLUSHED (-239) /**< Nonblocking request has already been flushed. 
It is too late to cancel */ #define NC_EADIOS (-240) /**< unknown ADIOS error */ +#define NC_EFSTYPE (-241) /**< Invalid file system type */ /* add new error here */ /* header inconsistency errors start from -250 */ diff --git a/src/libs/Makefile.am b/src/libs/Makefile.am index a932f20f5..17ea3ed75 100644 --- a/src/libs/Makefile.am +++ b/src/libs/Makefile.am @@ -23,6 +23,7 @@ libpnetcdf_la_SOURCES = libpnetcdf_la_LIBADD += ../dispatchers/libdispatchers.la libpnetcdf_la_LIBADD += ../drivers/common/libcommon.la libpnetcdf_la_LIBADD += ../drivers/ncmpio/libncmpio.la +libpnetcdf_la_LIBADD += ../drivers/pncio/libpncio.la if BUILD_DRIVER_FOO libpnetcdf_la_LIBADD += ../drivers/ncfoo/libncfoo.la endif @@ -71,6 +72,9 @@ endif ../drivers/ncmpio/libncmpio.la: set -e; cd ../drivers/ncmpio && $(MAKE) $(MFLAGS) +../drivers/pncio/libpncio.la: + set -e; cd ../drivers/pncio && $(MAKE) $(MFLAGS) + ../drivers/ncncio/libncncio.la: set -e; cd ../drivers/ncncio && $(MAKE) $(MFLAGS) diff --git a/src/utils/ncmpidiff/cdfdiff.c b/src/utils/ncmpidiff/cdfdiff.c index be19bc0c7..73426fc7c 100644 --- a/src/utils/ncmpidiff/cdfdiff.c +++ b/src/utils/ncmpidiff/cdfdiff.c @@ -187,9 +187,9 @@ struct vspec { /*----< get_var_names() >-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, struct vspec* vspecp) { - char *cp=optarg, **cpp; + char *cp=opt_arg, **cpp; int nvars = 1; /* compute number of variable names in comma-delimited list */ @@ -203,7 +203,7 @@ get_var_names(char *optarg, struct vspec* vspecp) cpp = vspecp->names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -237,11 +237,11 @@ get_type(int type) /*----< main() >--------------------------------------------------------------*/ int main(int argc, char **argv) { + /* int verbose; is defined as a locally global variable in ncvalidator.c */ extern char *optarg; extern int optind; - char *str, *ptr; size_t nbytes; - int i, j, k, m, n, c, err, verbose, quiet, isDiff; + int i, j, k, m, n, c, err, quiet, isDiff; int fd[2], nvars[2], ndims[2], nattrs[2], check_tolerance; int cmp_nvars, check_header, check_variable_list, check_entire_file; long long numVarDIFF=0, numHeadDIFF=0, numDIFF; @@ -264,7 +264,8 @@ int main(int argc, char **argv) var_list.nvars = 0; check_tolerance = 0; - while ((c = getopt(argc, argv, "bhqv:t:")) != -1) + while ((c = getopt(argc, argv, "bhqv:t:")) != -1) { + char *str, *ptr; switch(c) { case 'h': /* compare header only */ check_header = 1; @@ -301,6 +302,7 @@ int main(int argc, char **argv) usage(argv[0]); break; } + } /* quiet mode overwrites verbose */ if (quiet) verbose = 0; diff --git a/src/utils/ncmpidiff/ncmpidiff.c b/src/utils/ncmpidiff/ncmpidiff.c index 8f1fe4f20..6eed1bb98 100644 --- a/src/utils/ncmpidiff/ncmpidiff.c +++ b/src/utils/ncmpidiff/ncmpidiff.c @@ -60,6 +60,15 @@ } \ } +#define HANDLE_FILE_ERR(filename) { \ + if (err != NC_NOERR) { \ + fprintf(stderr, "Error at line %d: input file %s (%s)\n", __LINE__, \ + filename, ncmpi_strerror(err)); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + exit(-1); \ + } \ +} + #define CHECK_GLOBAL_ATT_DIFF_CHAR { \ int pos; \ char *b1 = (char *)calloc((attlen[0] + 1) * 2, sizeof(char)); \ @@ -300,9 +309,9 @@ struct vspec { /*----< get_var_names() >-----------------------------------------------------*/ static void -get_var_names(char *optarg, struct vspec* vspecp) +get_var_names(char *opt_arg, struct vspec* 
vspecp) { - char *cp=optarg, **cpp; + char *cp=opt_arg, **cpp; int nvars = 1; /* compute number of variable names in comma-delimited list */ @@ -316,7 +325,7 @@ get_var_names(char *optarg, struct vspec* vspecp) cpp = vspecp->names; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -450,7 +459,7 @@ int main(int argc, char **argv) /* file format version */ err = ncmpi_inq_file_format(argv[optind+i], &fmt[i]); - HANDLE_ERROR + HANDLE_FILE_ERR(argv[optind+i]) if (fmt[i] == NC_FORMAT_NETCDF4 || fmt[i] == NC_FORMAT_NETCDF4_CLASSIC) { #ifndef ENABLE_NETCDF4 diff --git a/src/utils/ncmpidump/ncmpidump.c b/src/utils/ncmpidump/ncmpidump.c index be3d72482..77829ef18 100644 --- a/src/utils/ncmpidump/ncmpidump.c +++ b/src/utils/ncmpidump/ncmpidump.c @@ -51,9 +51,9 @@ static void pr_att_string(size_t len, const char* string); static void pr_att_vals(nc_type type, size_t len, const double* vals); static void pr_att(int ncid, int varid, const char *varname, int ia); static void do_ncdump(const char* path, struct fspec* specp); -static void make_lvars(char* optarg, struct fspec* fspecp); -static void set_sigdigs( const char* optarg); -static void set_precision( const char *optarg); +static void make_lvars(char* opt_arg, struct fspec* fspecp); +static void set_sigdigs( const char* opt_arg); +static void set_precision( const char *opt_arg); int main(int argc, char** argv); #define STREQ(a, b) (*(a) == *(b) && strcmp((a), (b)) == 0) @@ -611,9 +611,9 @@ do_ncdump(const char *path, struct fspec* specp) static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -628,7 +628,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -647,15 +647,15 @@ make_lvars(char *optarg, struct fspec* fspecp) * command-line and update the default data formats appropriately. */ static void -set_sigdigs(const char *optarg) +set_sigdigs(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') + flt_digits = (int)strtol(opt_arg, &ptr1, 10); if (flt_digits < 1 || flt_digits > 20) error("unreasonable value for float significant digits: %d", @@ -679,15 +679,15 @@ set_sigdigs(const char *optarg) * and update the default data formats appropriately. 
*/ static void -set_precision(const char *optarg) +set_precision(const char *opt_arg) { char *ptr1 = 0; char *ptr2 = 0; int flt_digits = FLT_DIGITS; /* default floating-point digits */ int dbl_digits = DBL_DIGITS; /* default double-precision digits */ - if (optarg != 0 && (int) strlen(optarg) > 0 && optarg[0] != ',') { - flt_digits = (int)strtol(optarg, &ptr1, 10); + if (opt_arg != 0 && (int) strlen(opt_arg) > 0 && opt_arg[0] != ',') { + flt_digits = (int)strtol(opt_arg, &ptr1, 10); float_precision_specified = 1; } diff --git a/src/utils/ncmpidump/vardata.c b/src/utils/ncmpidump/vardata.c index 11310a9f9..b8cec8b99 100644 --- a/src/utils/ncmpidump/vardata.c +++ b/src/utils/ncmpidump/vardata.c @@ -82,9 +82,9 @@ static double double_eps; static float float_epsilon(void) { - float float_eps; + float val_float_eps; #ifndef NO_FLOAT_H - float_eps = FLT_EPSILON; + val_float_eps = FLT_EPSILON; #else /* NO_FLOAT_H */ { float etop, ebot, eps; @@ -103,19 +103,19 @@ float_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - float_eps = two * etop; + val_float_eps = two * etop; } #endif /* NO_FLOAT_H */ - return float_eps; + return val_float_eps; } static double double_epsilon(void) { - double double_eps; + double val_double_eps; #ifndef NO_FLOAT_H - double_eps = DBL_EPSILON; + val_double_eps = DBL_EPSILON; #else /* NO_FLOAT_H */ { double etop, ebot, eps; @@ -134,10 +134,10 @@ double_epsilon(void) ebot = eps; eps = ebot + (etop - ebot)/two; } - double_eps = two * etop; + val_double_eps = two * etop; } #endif /* NO_FLOAT_H */ - return double_eps; + return val_double_eps; } diff --git a/src/utils/ncmpigen/genlib.c b/src/utils/ncmpigen/genlib.c index 1e558a92f..26ac82d89 100644 --- a/src/utils/ncmpigen/genlib.c +++ b/src/utils/ncmpigen/genlib.c @@ -1548,7 +1548,6 @@ cl_fortran(void) } fline(stmnt); if (v->type != NC_CHAR) { - char *sp; sprintf(stmnt, "%s %s(", ncftype(v->type), v->lname); /* reverse dimensions for FORTRAN */ @@ -1582,12 +1581,12 @@ cl_fortran(void) if (v->has_data) { fline(v->data_stmnt); } else { /* generate data statement for FILL record */ - MPI_Offset rec_len = 1; + MPI_Offset rec_size = 1; for (idim = 1; idim < v->ndims; idim++) { - rec_len *= dims[v->dims[idim]].size; + rec_size *= dims[v->dims[idim]].size; } sprintf(stmnt,"data %s /%lu * %s/", v->lname, - (unsigned long) rec_len, + (unsigned long) rec_size, f_fill_name(v->type)); fline(stmnt); } @@ -1695,9 +1694,9 @@ close_netcdf(void) void -check_err(int stat, const char *ncmpi_func, const char *calling_func, int lineno, const char *calling_file) { +check_err(int stat, const char *ncmpi_func, const char *calling_func, int linenum, const char *calling_file) { if (stat != NC_NOERR) { - fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, lineno, calling_file, ncmpi_strerror(stat)); + fprintf(stderr, "ncmpigen error when calling %s in %s() at line %d of %s: %s\n", ncmpi_func, calling_func, linenum, calling_file, ncmpi_strerror(stat)); derror_count++; } } diff --git a/src/utils/ncmpigen/load.c b/src/utils/ncmpigen/load.c index 69fe54e3a..788450aa2 100644 --- a/src/utils/ncmpigen/load.c +++ b/src/utils/ncmpigen/load.c @@ -394,7 +394,7 @@ fstrcat( */ static void f_var_init( - int varnum, /* which variable */ + int varid, /* which variable */ void *rec_start /* start of data */ ) { @@ -415,9 +415,9 @@ f_var_init( int ival; /* load variable with data values */ - sprintf(stmnt, "data %s /",vars[varnum].lname); + sprintf(stmnt, "data %s /",vars[varid].lname); stmnt_len = 
strlen(stmnt); - switch (vars[varnum].type) { + switch (vars[varid].type) { case NC_BYTE: charvalp = (char *) rec_start; for (ival = 0; ival < var_len-1; ival++) { @@ -524,10 +524,10 @@ f_var_init( /* For record variables, store data statement for later use; otherwise, just print it. */ - if (vars[varnum].ndims > 0 && vars[varnum].dims[0] == rec_dim) { + if (vars[varid].ndims > 0 && vars[varid].dims[0] == rec_dim) { char *dup_stmnt = (char*) emalloc(strlen(stmnt)+1); strcpy(dup_stmnt, stmnt); /* ULTRIX missing strdup */ - vars[varnum].data_stmnt = dup_stmnt; + vars[varid].data_stmnt = dup_stmnt; } else { fline(stmnt); } diff --git a/src/utils/ncmpigen/ncmpigentab.c b/src/utils/ncmpigen/ncmpigentab.c index 117e7d494..069986788 100644 --- a/src/utils/ncmpigen/ncmpigentab.c +++ b/src/utils/ncmpigen/ncmpigentab.c @@ -1,6 +1,8 @@ +/* #ifndef lint static const char yysccsid[] = "@(#)yaccpar 1.9 (Berkeley) 02/21/93"; #endif +*/ #include #include @@ -617,7 +619,6 @@ static int yygrowstack(void) #define YYABORT goto yyabort #define YYREJECT goto yyabort #define YYACCEPT goto yyaccept -#define YYERROR goto yyerrlab int yyparse(void) { @@ -686,11 +687,6 @@ yyparse(void) yyerror("syntax error"); -#ifdef lint - goto yyerrlab; -#endif - -yyerrlab: ++yynerrs; yyinrecovery: diff --git a/src/utils/ncoffsets/ncoffsets.c b/src/utils/ncoffsets/ncoffsets.c index 977f199dd..1e2a32972 100644 --- a/src/utils/ncoffsets/ncoffsets.c +++ b/src/utils/ncoffsets/ncoffsets.c @@ -1802,9 +1802,9 @@ struct fspec { }; static void -make_lvars(char *optarg, struct fspec* fspecp) +make_lvars(char *opt_arg, struct fspec* fspecp) { - char *cp = optarg; + char *cp = opt_arg; int nvars = 1; char ** cpp; @@ -1819,7 +1819,7 @@ make_lvars(char *optarg, struct fspec* fspecp) cpp = fspecp->lvars; /* copy variable names into list */ - for (cp = strtok(optarg, ","); + for (cp = strtok(opt_arg, ","); cp != NULL; cp = strtok((char *) NULL, ",")) { @@ -2070,7 +2070,7 @@ int main(int argc, char *argv[]) /* print fixed-size variables first */ if (num_fix_vars) printf("\nfixed-size variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long size; @@ -2162,7 +2162,7 @@ int main(int argc, char *argv[]) /* print record variables */ if (num_rec_vars) printf("\nrecord variables:\n"); for (i=0; inlvars; i++) { - int j, ndims, cdots; + int ndims, cdots; char type_str[16], str[1024], *line; size_t lineLen; long long var_begin, var_end, size, numrecs; diff --git a/src/utils/ncvalidator/ncvalidator.c b/src/utils/ncvalidator/ncvalidator.c index da58bcf6c..078d2abf5 100644 --- a/src/utils/ncvalidator/ncvalidator.c +++ b/src/utils/ncvalidator/ncvalidator.c @@ -1397,7 +1397,7 @@ val_get_NC_attr(int fd, NC_attr **attrpp, const char *loc) { - char *name=NULL, xloc[1024]; + char *name=NULL, xloc[2048]; int err, status=NC_NOERR; size_t err_addr, name_len; nc_type xtype; @@ -2401,7 +2401,7 @@ val_get_NC(int fd, NC *ncp) /* check zero padding in the blank space betwee header size and extent */ if (repair && ncp->begin_var - ncp->xsz > 0) { - size_t i, gap = ncp->begin_var - ncp->xsz; + size_t gap = ncp->begin_var - ncp->xsz; ssize_t readLen; char *buf = (char*) malloc(gap); @@ -2448,7 +2448,7 @@ val_get_NC(int fd, NC *ncp) #ifndef BUILD_CDFDIFF -/* File system types recognized by ROMIO in MPICH 4.0.0 */ +/* File system types recognized by ROMIO in MPICH 4.0.0, and by PnetCDF */ static const char* fstypes[] = {"ufs", "nfs", "xfs", "pvfs2", "gpfs", "panfs", "lustre", "daos", 
"testfs", "ime", "quobyte", NULL}; /* Return a pointer to filename by removing the file system type prefix name if diff --git a/test/C/Makefile.am b/test/C/Makefile.am index 4d0668b6f..9336c3aec 100644 --- a/test/C/Makefile.am +++ b/test/C/Makefile.am @@ -68,7 +68,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/C/parallel_run.sh b/test/C/parallel_run.sh index 9fe1f41a8..76d9b9acb 100755 --- a/test/C/parallel_run.sh +++ b/test/C/parallel_run.sh @@ -17,7 +17,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "MPIRUN = ${MPIRUN}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -27,17 +27,46 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc ${MPIRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" @@ -67,6 +96,7 @@ for intra_aggr in 0 1 ; do fi done done +done rm -f ${OUTDIR}/*.nc rm -f ${OUTDIR}/*.nc4 diff --git a/test/C/pres_temp_4D_rd.c b/test/C/pres_temp_4D_rd.c index 84257c3b0..76078d798 100644 --- a/test/C/pres_temp_4D_rd.c +++ b/test/C/pres_temp_4D_rd.c @@ -191,9 +191,12 @@ int main(int argc, char **argv) } i++; } - } /* next record */ fn_exit: + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (nerrs > 0) break; + } /* next record */ + /* Close the file. 
*/ err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/C/seq_runs.sh b/test/C/seq_runs.sh index 1098e2896..9bab67c2c 100755 --- a/test/C/seq_runs.sh +++ b/test/C/seq_runs.sh @@ -23,8 +23,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} ./pres_temp_4D_wr ${TESTOUTDIR}/pres_temp_4D.nc ${TESTSEQRUN} ./pres_temp_4D_rd ${TESTOUTDIR}/pres_temp_4D.nc # echo "--- validating file ${TESTOUTDIR}/pres_temp_4D.nc" @@ -54,5 +76,6 @@ for j in ${safe_modes} ; do fi # echo "" done +done rm -f ${OUTDIR}/*.nc rm -f ${OUTDIR}/*.nc4 diff --git a/test/CXX/Makefile.am b/test/CXX/Makefile.am index 532133e05..374be8757 100644 --- a/test/CXX/Makefile.am +++ b/test/CXX/Makefile.am @@ -60,7 +60,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/CXX/nctst.cpp b/test/CXX/nctst.cpp index 84cd1a4ad..41aca948b 100644 --- a/test/CXX/nctst.cpp +++ b/test/CXX/nctst.cpp @@ -568,7 +568,7 @@ main(int argc, char* argv[]) // test new netCDF interface if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } diff --git a/test/CXX/parallel_run.sh b/test/CXX/parallel_run.sh index 4f887ad27..a7efc1485 100755 --- a/test/CXX/parallel_run.sh +++ b/test/CXX/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "TESTPROGRAMS=${TESTPROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,46 @@ unset PNETCDF_HINTS for i in ${TESTPROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test 
"x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc # echo "--- validating file ${TESTOUTDIR}/$i.nc" @@ -67,6 +96,7 @@ for i in ${TESTPROGRAMS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/CXX/test_classic.cpp b/test/CXX/test_classic.cpp index c600181af..a38182de6 100644 --- a/test/CXX/test_classic.cpp +++ b/test/CXX/test_classic.cpp @@ -88,7 +88,7 @@ int main( int argc, char *argv[] ) if (err == NC_NOERR) { MPI_Reduce(&malloc_size, &sum_size, 1, MPI_OFFSET, MPI_SUM, 0, MPI_COMM_WORLD); if (rank == 0 && sum_size > 0) - printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n", + printf("heap memory allocated by PnetCDF internally has " OFFFMT " bytes yet to be freed\n", sum_size); } diff --git a/test/CXX/wrap_runs.sh b/test/CXX/wrap_runs.sh index d34ca7b47..49bfef8d2 100755 --- a/test/CXX/wrap_runs.sh +++ b/test/CXX/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} ./$1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -45,6 +67,7 @@ for j in ${safe_modes} ; do ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/F90/Makefile.am b/test/F90/Makefile.am index 4d4812767..5249744f9 100644 --- a/test/F90/Makefile.am +++ b/test/F90/Makefile.am @@ -86,28 +86,28 @@ ptest ptest4: $(PARALLEL_PROGS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - 
$(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptest8: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 8 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 8 || exit 1 + $(srcdir)/../parallel_run.sh 8 || exit 1 ptest10: $(PARALLEL_PROGS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 10 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 10 || exit 1 + $(srcdir)/../parallel_run.sh 10 || exit 1 ptests: ptest2 ptest4 ptest8 ptest10 ptest6: diff --git a/test/F90/f90tst_parallel.f90 b/test/F90/f90tst_parallel.f90 index d6223a334..4af5f9705 100644 --- a/test/F90/f90tst_parallel.f90 +++ b/test/F90/f90tst_parallel.f90 @@ -100,6 +100,9 @@ program f90tst_parallel ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + call handle_err(nf90mpi_enddef(ncid)) ! Determine what part of the variable will be written for this diff --git a/test/F90/f90tst_parallel4.f90 b/test/F90/f90tst_parallel4.f90 index b545fa847..70c652bba 100644 --- a/test/F90/f90tst_parallel4.f90 +++ b/test/F90/f90tst_parallel4.f90 @@ -45,10 +45,12 @@ program f90tst call handle_err(nf90mpi_def_dim(fh, 'dim2', 4_MPI_OFFSET_KIND, dimid(2))) call handle_err(nf90mpi_def_dim(fh, 'dim3', 1_MPI_OFFSET_KIND, dimid(3))) - call handle_err(nf90mpi_def_var(fh, 'var1', NF90_DOUBLE, dimid, varid)) - call handle_err(nf90mpi_enddef(fh)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(fh, varid, 0, NF90_FILL_DOUBLE)) + + call handle_err(nf90mpi_enddef(fh)) do i=1,3 f(i) = my_rank*3+i diff --git a/test/F90/f90tst_vars.f90 b/test/F90/f90tst_vars.f90 index 7f43a6908..e114fddf7 100644 --- a/test/F90/f90tst_vars.f90 +++ b/test/F90/f90tst_vars.f90 @@ -39,6 +39,7 @@ program f90tst_vars call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -78,6 +79,9 @@ program f90tst_vars ! Define the variable. call handle_err(nf90mpi_def_var(ncid, "data", NF90_INT, dimids, varid)) + ! fill with default fill value + call handle_err(nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT)) + ! With classic model netCDF-4 file, enddef must be called. call handle_err(nf90mpi_enddef(ncid)) diff --git a/test/F90/f90tst_vars2.f90 b/test/F90/f90tst_vars2.f90 index d2011970e..1defbf9bf 100644 --- a/test/F90/f90tst_vars2.f90 +++ b/test/F90/f90tst_vars2.f90 @@ -24,9 +24,8 @@ program f90tst_vars2 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFLATE_LEVEL = 4 integer, parameter :: EightByteInt = selected_int_kind(18) integer (kind = EightByteInt) :: TOE_SAN_VALUE = 2147483648_EightByteInt @@ -53,6 +52,7 @@ program f90tst_vars2 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 
0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -94,6 +94,8 @@ program f90tst_vars2 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) ! enter independent data mode diff --git a/test/F90/f90tst_vars3.f90 b/test/F90/f90tst_vars3.f90 index 88fadc569..7c712bd0f 100644 --- a/test/F90/f90tst_vars3.f90 +++ b/test/F90/f90tst_vars3.f90 @@ -24,9 +24,8 @@ program f90tst_vars3 ! We need these ids and other gunk for netcdf. integer :: ncid, varid1, varid2, varid3, varid4, varid5, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: DEFAULT_CACHE_NELEMS = 10000, DEFAULT_CACHE_SIZE = 1000000 integer, parameter :: DEFAULT_CACHE_PREEMPTION = 22 integer, parameter :: DEFLATE_LEVEL = 4 @@ -54,6 +53,7 @@ program f90tst_vars3 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -95,6 +95,8 @@ program f90tst_vars3 call check(nf90mpi_def_var(ncid, VAR4_NAME, NF90_INT, x_dimid, varid4)) call check(nf90mpi_def_var(ncid, VAR5_NAME, NF90_INT, dimids, varid5)) + call check(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + call check(nf90mpi_enddef(ncid)) call check(nf90mpi_begin_indep_data(ncid)) diff --git a/test/F90/f90tst_vars4.f90 b/test/F90/f90tst_vars4.f90 index 1104246e3..6f48a638f 100644 --- a/test/F90/f90tst_vars4.f90 +++ b/test/F90/f90tst_vars4.f90 @@ -22,10 +22,9 @@ program f90tst_vars4 ! We need these ids and other gunk for netcdf. integer :: ncid, varid, dimids(MAX_DIMS) - integer :: x_dimid, y_dimid + integer :: x, y, x_dimid, y_dimid, old_fillmode integer :: mode_flag integer :: nvars, ngatts, ndims, unlimdimid, file_format - integer :: x, y integer, parameter :: CACHE_SIZE = 1000000 integer :: xtype_in, natts_in, dimids_in(MAX_DIMS) character (len = NF90_MAX_NAME) :: name_in @@ -39,6 +38,7 @@ program f90tst_vars4 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) @@ -73,6 +73,8 @@ program f90tst_vars4 ! Define the variable. call handle_err(nf90mpi_def_var(ncid, 'data', NF90_INT, dimids, varid)) + call handle_err(nf90mpi_set_fill(ncid, NF90_FILL, old_fillmode)) + ! enddef must be called. 
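The Fortran hunks above all add the same idiom: turn filling on while still in define mode, either per variable (nf90mpi_def_var_fill with no_fill=0) or file-wide (nf90mpi_set_fill with NF90_FILL), so that cells no rank ever writes still read back as well-defined fill values. A minimal C sketch of that idiom, using the C APIs these Fortran bindings wrap; the file, dimension, and variable names are illustrative:

    #include <mpi.h>
    #include <pnetcdf.h>

    /* Both fill calls must happen in define mode, before ncmpi_enddef(). */
    int create_with_fill(MPI_Comm comm, const char *path)
    {
        int err, ncid, dimid, varid, old_fillmode;

        err = ncmpi_create(comm, path, NC_CLOBBER, MPI_INFO_NULL, &ncid);
        if (err != NC_NOERR) return err;

        ncmpi_def_dim(ncid, "x", 4, &dimid);
        ncmpi_def_var(ncid, "data", NC_INT, 1, &dimid, &varid);

        /* per-variable: no_fill=0 enables filling; a NULL fill value
         * selects the default, NC_FILL_INT for an NC_INT variable */
        ncmpi_def_var_fill(ncid, varid, 0, NULL);

        /* file-wide alternative, saving the previous fill mode */
        ncmpi_set_fill(ncid, NC_FILL, &old_fillmode);

        ncmpi_enddef(ncid);
        return ncmpi_close(ncid);
    }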
call handle_err(nf90mpi_enddef(ncid)) diff --git a/test/F90/parallel_run.sh b/test/F90/parallel_run.sh index da29ea3bc..0e2be8992 100755 --- a/test/F90/parallel_run.sh +++ b/test/F90/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "PARALLEL_PROGS=${PARALLEL_PROGS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,47 @@ unset PNETCDF_HINTS for i in ${PARALLEL_PROGS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + + # echo "${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc" ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc # echo "--- validating file ${TESTOUTDIR}/$i.nc" @@ -69,6 +99,7 @@ for i in ${PARALLEL_PROGS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/F90/test_attr_int64.f90 b/test/F90/test_attr_int64.f90 index 19b19183c..d8a821413 100644 --- a/test/F90/test_attr_int64.f90 +++ b/test/F90/test_attr_int64.f90 @@ -38,6 +38,7 @@ program main call MPI_Comm_rank(MPI_COMM_WORLD, rank, err) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 
0) then filename = 'testfile.nc' err = get_args(cmd, filename) diff --git a/test/F90/test_fill.f90 b/test/F90/test_fill.f90 index a35f52467..c63b1f63e 100644 --- a/test/F90/test_fill.f90 +++ b/test/F90/test_fill.f90 @@ -29,14 +29,16 @@ integer function tst_fmt(filename, mode) implicit none character(LEN=256) filename - integer i, err, ierr, rank + integer i, err, ierr, rank, nprocs integer :: ncid, mode, cmode, dimid(1), varid integer(kind=MPI_OFFSET_KIND) :: start(1) integer(kind=MPI_OFFSET_KIND) :: count(1) + integer(kind=MPI_OFFSET_KIND) :: dim_len integer(kind=MPI_OFFSET_KIND), parameter :: len = 3 integer, parameter :: k = selected_int_kind(18) integer(kind=k) :: buf(len) + call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr) call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) tst_fmt = 0 @@ -51,7 +53,8 @@ integer function tst_fmt(filename, mode) call check(err, 'In nf90mpi_create: ') tst_fmt = tst_fmt + err - err = nf90mpi_def_dim(ncid, "dim", len, dimid(1)) + dim_len = len * nprocs + err = nf90mpi_def_dim(ncid, "dim", dim_len, dimid(1)) call check(err, 'In nf90mpi_def_dim: ') tst_fmt = tst_fmt + err @@ -74,7 +77,7 @@ integer function tst_fmt(filename, mode) tst_fmt = tst_fmt + err ! Write buf - start(1) = 1 + start(1) = len * rank + 1 count(1) = len err = nf90mpi_put_var_all(ncid, varid, buf, start, count) call check(err, 'In nf90mpi_put_var_all: ') @@ -97,6 +100,7 @@ program test call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then filename = 'testfile.nc' err = get_args(cmd, filename) diff --git a/test/F90/test_intent.f90 b/test/F90/test_intent.f90 index 70612ca86..b5d096b4a 100644 --- a/test/F90/test_intent.f90 +++ b/test/F90/test_intent.f90 @@ -63,6 +63,7 @@ program main call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (rank .EQ. 0) then filename = 'testfile.nc' err = get_args(cmd, filename) @@ -109,34 +110,38 @@ program main call check(err, 'In nfmpi_put_att_int8: ') ! define a variable of an integer array of size 3 in the nc file - err = nfmpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) - call check(err, 'In nfmpi_def_dim: ') + err = nf90mpi_def_dim(ncid, 'X', 3_MPI_OFFSET_KIND, dimid(1)) + call check(err, 'In nf90mpi_def_dim: ') - err = nfmpi_def_var(ncid, 'var', NF90_INT, 1, dimid, varid) - call check(err, 'In nfmpi_def_var: ') + err = nf90mpi_def_var(ncid, 'var', NF90_INT, dimid, varid) + call check(err, 'In nf90mpi_def_var: ') - err = nfmpi_enddef(ncid) - call check(err, 'In nfmpi_enddef: ') + ! fill with default fill value + err = nf90mpi_def_var_fill(ncid, varid, 0, NF90_FILL_INT) + call check(err, 'In nf90mpi_def_var_fill: ') + + err = nf90mpi_enddef(ncid) + call check(err, 'In nf90mpi_enddef: ') ! bufsize must be max of data type converted before and after bufsize = 3*4 - err = nfmpi_buffer_attach(ncid, bufsize) - call check(err, 'In nfmpi_buffer_attach: ') + err = nf90mpi_buffer_attach(ncid, bufsize) + call check(err, 'In nf90mpi_buffer_attach: ') start(1) = 1 count(1) = 3 - err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf, req(1)) + err = nfmpi_bput_vara_int(ncid, varid, start, count, ibuf(1:), req(1)) call check(err, 'In nfmpi_bput_vara_int: ') - err = nfmpi_wait_all(ncid, 1, req, status) - call check(err, 'In nfmpi_wait_all: ') + err = nf90mpi_wait_all(ncid, 1, req, status) + call check(err, 'In nf90mpi_wait_all: ') if (status(1) .ne. 
NF90_NOERR) then - print*,'Error at bput status ', nfmpi_strerror(status(1)) + print*,'Error at bput status ', nf90mpi_strerror(status(1)) endif - err = nfmpi_buffer_detach(ncid) - call check(err, 'In nfmpi_buffer_detach: ') + err = nf90mpi_buffer_detach(ncid) + call check(err, 'In nf90mpi_buffer_detach: ') ! close the file err = nf90mpi_close(ncid) diff --git a/test/F90/tst_f90.f90 b/test/F90/tst_f90.f90 index 3369556c9..43f789938 100644 --- a/test/F90/tst_f90.f90 +++ b/test/F90/tst_f90.f90 @@ -82,6 +82,7 @@ program netcdfTest call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/tst_f90_cdf5.f90 b/test/F90/tst_f90_cdf5.f90 index eae87baee..3432524cb 100644 --- a/test/F90/tst_f90_cdf5.f90 +++ b/test/F90/tst_f90_cdf5.f90 @@ -22,6 +22,7 @@ program tst_f90_nc4 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/tst_types2.f90 b/test/F90/tst_types2.f90 index f1506eb76..a9f8e7b6b 100644 --- a/test/F90/tst_types2.f90 +++ b/test/F90/tst_types2.f90 @@ -43,6 +43,7 @@ program tst_types2 call MPI_Comm_size(MPI_COMM_WORLD, p, ierr) ! take filename from command-line argument if there is any + cmd = ' ' if (my_rank .EQ. 0) then filename = FILE_NAME err = get_args(cmd, filename) diff --git a/test/F90/wrap_runs.sh b/test/F90/wrap_runs.sh index 716aacf06..fcfb29fe9 100755 --- a/test/F90/wrap_runs.sh +++ b/test/F90/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc # echo "" @@ -36,7 +58,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -49,6 +71,7 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/Makefile.am b/test/Makefile.am index f5eb9d5b4..7d9407403 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -64,6 +64,8 @@ else PTEST_SUBDIRS = $(SUBDIRS) endif +EXTRA_DIST = parallel_run.sh + ptest: @for d in $(PTEST_SUBDIRS) ; do \ $(MAKE) $(MFLAGS) -C $$d ptest $$* || exit 1 ; \ diff --git a/test/adios/parallel_run.sh b/test/adios/parallel_run.sh index 612fd7591..a6602f77c 100755 --- a/test/adios/parallel_run.sh +++ 
b/test/adios/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,16 +26,46 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" if test "$i" = open ; then ${MPIRUN} ./$i ${srcdir}/arrays.bp diff --git a/test/adios/wrap_runs.sh b/test/adios/wrap_runs.sh index e619098d0..d7647dd15 100755 --- a/test/adios/wrap_runs.sh +++ b/test/adios/wrap_runs.sh @@ -22,8 +22,19 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + if test "$1" = ./open ; then ${TESTSEQRUN} $1 ${srcdir}/arrays.bp ${TESTSEQRUN} $1 ${srcdir}/attributes.bp diff --git a/test/burst_buffer/parallel_run.sh b/test/burst_buffer/parallel_run.sh index d726ee760..73b9d39a1 100755 --- a/test/burst_buffer/parallel_run.sh +++ b/test/burst_buffer/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "TESTPROGRAMS=${TESTPROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,45 @@ unset PNETCDF_HINTS for i in ${TESTPROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + 
INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" @@ -59,6 +87,7 @@ for i in ${TESTPROGRAMS} ; do ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.nc*.data rm -f ${OUTDIR}/$i.nc*.meta diff --git a/test/burst_buffer/varn.c b/test/burst_buffer/varn.c index 81e93063e..3913f3700 100644 --- a/test/burst_buffer/varn.c +++ b/test/burst_buffer/varn.c @@ -54,7 +54,7 @@ int main(int argc, char *argv[]) { free(cmd_str); } - /* Initialize file info */ + /* Initialize file info */ MPI_Info_create(&info); MPI_Info_set(info, "nc_burst_buf", "enable"); @@ -83,21 +83,25 @@ int main(int argc, char *argv[]) { /* Standard varn */ err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } /* NULL counts */ err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, NULL, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } @@ -106,14 +110,17 @@ int main(int argc, char *argv[]) { Counts[i] = (MPI_Offset*)counts[i]; } err = ncmpi_put_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR + for (i=0; i<10; i++) buffer[0] = -1; err = ncmpi_get_varn_int_all(ncid, varid, 10, Starts, Counts, buffer); CHECK_ERR for(i = 0; i < 10; i++){ if (buffer[i] != rank + i){ - nerrs++; printf("Error at line %d in %s: expecting buffer[%d] = %d but got %d\n", __LINE__, __FILE__, i, rank + 1, buffer[i]); + nerrs++; + goto err_out; } } +err_out: /* Close the file */ err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/burst_buffer/wrap_runs.sh b/test/burst_buffer/wrap_runs.sh index 308ccfc19..e7cfeda85 100755 --- a/test/burst_buffer/wrap_runs.sh +++ b/test/burst_buffer/wrap_runs.sh @@ -24,13 +24,39 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + +for bb_mode in 1 ; do + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + 
PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "$bb_mode" = 1 ; then + PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + fi + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc done +done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc_0_0.data diff --git a/test/cdf_format/Makefile.am b/test/cdf_format/Makefile.am index a8d18918f..d7883e94f 100644 --- a/test/cdf_format/Makefile.am +++ b/test/cdf_format/Makefile.am @@ -75,7 +75,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/cdf_format/parallel_run.sh b/test/cdf_format/parallel_run.sh index 9f95d0813..dc8677ebf 100755 --- a/test/cdf_format/parallel_run.sh +++ b/test/cdf_format/parallel_run.sh @@ -19,7 +19,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "srcdir = ${srcdir}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,17 +29,46 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi for intra_aggr in 0 1 ; do - if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" - else - export PNETCDF_HINTS= - fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi - export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + if test "$intra_aggr" = 1 ; then + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${MPIRUN} ./test_inq_format ${srcdir} ${MPIRUN} ./cdf_type ${TESTOUTDIR}/cdf_type.nc ${MPIRUN} ./dim_cdf12 ${TESTOUTDIR}/dim_cdf12.nc @@ -70,6 +99,7 @@ for intra_aggr in 0 1 ; do fi done done +done rm -f ${OUTDIR}/dim_cdf12.nc rm -f ${OUTDIR}/cdf_type.nc diff --git a/test/cdf_format/wrap_runs.sh b/test/cdf_format/wrap_runs.sh index c749c7dbe..f0370fd40 100755 --- a/test/cdf_format/wrap_runs.sh +++ 
b/test/cdf_format/wrap_runs.sh @@ -26,8 +26,30 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc @@ -35,7 +57,7 @@ for j in ${safe_modes} ; do echo "" echo "---- testing burst buffering" - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + export PNETCDF_HINTS="$PNETCDF_HINTS;nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.bb.nc unset PNETCDF_HINTS ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.bb.nc @@ -48,5 +70,6 @@ for j in ${safe_modes} ; do fi fi done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.bb.nc diff --git a/test/cdl/Makefile.am b/test/cdl/Makefile.am index 493d5ccd2..81677ad01 100644 --- a/test/cdl/Makefile.am +++ b/test/cdl/Makefile.am @@ -54,7 +54,7 @@ ptest ptests ptest4: $(TESTPROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2 ptest6 ptest8 ptest10: diff --git a/test/common/testutils.h b/test/common/testutils.h index 0dadd1c35..ef3d2593f 100644 --- a/test/common/testutils.h +++ b/test/common/testutils.h @@ -16,6 +16,9 @@ #include #include +#define MODE_COLL 1 +#define MODE_INDEP 0 + #define CHECK_ERR { \ if (err != NC_NOERR) { \ nerrs++; \ @@ -24,6 +27,16 @@ } \ } +#define CHECK_ERR_ALL { \ + if (err != NC_NOERR) { \ + nerrs++; \ + printf("Error at line %d in %s: (%s)\n", \ + __LINE__,__FILE__,ncmpi_strerrno(err)); \ + } \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + #define CHECK_ERROUT { \ if (err != NC_NOERR) { \ nerrs++; \ @@ -38,7 +51,7 @@ nerrs++; \ printf("Error at line %d in %s: (%s)\n", \ __LINE__,__FILE__,ncmpi_strerrno(err)); \ - MPI_Abort(MPI_COMM_WORLD, -1); \ + goto fn_exit; \ } \ } @@ -50,6 +63,21 @@ } \ } +#define CHECK_EXP_ERR_ALL(exp) { \ + if (err != exp) { \ + nerrs++; \ + printf("Error at line %d in %s: expecting %s but got %s\n", \ + __LINE__,__FILE__,ncmpi_strerrno(exp), ncmpi_strerrno(err)); \ + } \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + +#define CHECK_NERRS_ALL { \ + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); \ + if (nerrs > 0) goto fn_exit; \ +} + int inq_env_hint(char *hint_key, char **hint_value); #ifdef PNETCDF_DEBUG diff --git a/test/fandc/csnap.c b/test/fandc/csnap.c index 2443086e5..7f81806cc 100644 --- a/test/fandc/csnap.c +++ b/test/fandc/csnap.c @@ -64,7 +64,7 @@ int pe_coords[3]; /* Cartesian PE coords */ /*** function prototypes ***/ -void 
find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin); +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin); void write_file(char *filename, double *t); void read_file(char *filename, double *t); void get_fields(double *tt, double *smf); @@ -390,14 +390,14 @@ void read_file(char *filename, double *t) { } -void find_locnx(MPI_Offset nx, int mype, int totpes, MPI_Offset *locnx, MPI_Offset *xbegin) { +void find_locnx(MPI_Offset nx, int rank, int nprocs, MPI_Offset *locnx, MPI_Offset *xbegin) { MPI_Offset xremain; - *locnx = nx / totpes; - xremain = nx - totpes*(*locnx); - if (mype < xremain) (*locnx)++; - *xbegin = mype*(nx/totpes) + xremain; - if (mype < xremain) *xbegin += mype - xremain; + *locnx = nx / nprocs; + xremain = nx - nprocs*(*locnx); + if (rank < xremain) (*locnx)++; + *xbegin = rank*(nx/nprocs) + xremain; + if (rank < xremain) *xbegin += rank - xremain; } diff --git a/test/header/Makefile.am b/test/header/Makefile.am index 8080f6885..2dddc7e44 100644 --- a/test/header/Makefile.am +++ b/test/header/Makefile.am @@ -60,14 +60,14 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptests: ptest2 ptest4 ptest6 ptest8 ptest10: diff --git a/test/header/parallel_run.sh b/test/header/parallel_run.sh index a4d0770fa..a339fd83f 100755 --- a/test/header/parallel_run.sh +++ b/test/header/parallel_run.sh @@ -18,7 +18,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -29,16 +29,46 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + else + INA_HINTS="nc_num_aggrs_per_node=0" + fi + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" ${MPIRUN} 
./$i ${TESTOUTDIR}/$i.nc @@ -68,6 +98,7 @@ for i in ${check_PROGRAMS} ; do fi done done + done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.bb.nc done diff --git a/test/header/seq_runs.sh b/test/header/seq_runs.sh index 475c8da8a..f35af292e 100755 --- a/test/header/seq_runs.sh +++ b/test/header/seq_runs.sh @@ -24,8 +24,30 @@ unset PNETCDF_HINTS # header consistency tests are designed to run on more than one MPI process for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$outfile.nc @@ -42,3 +64,4 @@ for j in ${safe_modes} ; do ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$outfile.nc ${TESTOUTDIR}/$outfile.bb.nc fi done +done diff --git a/test/largefile/Makefile.am b/test/largefile/Makefile.am index d74134b76..f5a014332 100644 --- a/test/largefile/Makefile.am +++ b/test/largefile/Makefile.am @@ -90,7 +90,7 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptests: ptest4 ptest2 ptest6 ptest8 ptest10: diff --git a/test/nc4/parallel_run.sh b/test/nc4/parallel_run.sh index 6e0dfa371..85dc275bd 100755 --- a/test/nc4/parallel_run.sh +++ b/test/nc4/parallel_run.sh @@ -15,7 +15,7 @@ MPIRUN=`echo ${TESTMPIRUN} | ${SED} -e "s/NP/$1/g"` # echo "check_PROGRAMS=${check_PROGRAMS}" # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -26,20 +26,15 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode export PNETCDF_HINTS="romio_no_indep_rw=true" else export PNETCDF_HINTS= fi - if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" - fi export PNETCDF_SAFE_MODE=$j # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc done - done rm -f ${OUTDIR}/$i.nc rm -f ${OUTDIR}/$i.nc.cdf4 done diff --git a/test/nc4/wrap_runs.sh b/test/nc4/wrap_runs.sh index 10d76802b..885d31b70 100755 --- a/test/nc4/wrap_runs.sh +++ b/test/nc4/wrap_runs.sh @@ -23,10 +23,33 @@ fi unset PNETCDF_HINTS for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export 
PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + ${TESTSEQRUN} $1 ${TESTOUTDIR}/$outfile.nc done +done rm -f ${OUTDIR}/$outfile.nc rm -f ${OUTDIR}/$outfile.nc.cdf4 diff --git a/test/nc_test/t_nc.c b/test/nc_test/t_nc.c index c15b8d52d..1f1fb0873 100644 --- a/test/nc_test/t_nc.c +++ b/test/nc_test/t_nc.c @@ -112,32 +112,32 @@ static MPI_Offset sizes[] = { NC_UNLIMITED, SIZE_1 , SIZE_2 }; static const char * const dim_names[] = { "record", "ixx", "iyy"}; static int -createtestdims(int cdfid, size_t num_dims, const MPI_Offset *sizes, const char * const dim_names[]) +createtestdims(int cdfid, size_t ndims, const MPI_Offset *dim_sizes, const char * const names[]) { int dimid, err; - while(num_dims-- != 0) + while(ndims-- != 0) { - err = ncmpi_def_dim(cdfid, *dim_names++, *sizes, &dimid); ERR - sizes++; + err = ncmpi_def_dim(cdfid, *names++, *dim_sizes, &dimid); ERR + dim_sizes++; } return 0; } static int -testdims(int cdfid, size_t num_dims, MPI_Offset *sizes, const char * const dim_names[]) +testdims(int cdfid, size_t ndims, MPI_Offset *dim_sizes, const char * const names[]) { int ii, err; MPI_Offset size; char cp[NC_MAX_NAME]; - for(ii=0; (size_t) ii < num_dims; ii++, sizes++) + for(ii=0; (size_t) ii < ndims; ii++, dim_sizes++) { err = ncmpi_inq_dim(cdfid, ii, cp, &size); ERR - if( size != *sizes) + if( size != *dim_sizes) (void) fprintf(stderr, "%d: %lu != %lu\n", - ii, (unsigned long)size, (unsigned long)*sizes); - if ( size != *sizes) return 1; - if ( strcmp(cp, *dim_names++) != 0) return 1; + ii, (unsigned long)size, (unsigned long)*dim_sizes); + if ( size != *dim_sizes) return 1; + if ( strcmp(cp, *names++) != 0) return 1; } return 0; } @@ -195,11 +195,11 @@ static struct tcdfvar { #define NUM_TESTVARS 6 static int -createtestvars(int id, const struct tcdfvar *testvars, size_t count) +createtestvars(int id, const struct tcdfvar *vars, size_t count) { int ii, err; int varid; - const struct tcdfvar *vp = testvars; + const struct tcdfvar *vp = vars; for(ii = 0; (size_t) ii < count; ii++, vp++ ) { diff --git a/test/nc_test/test_iput.m4 b/test/nc_test/test_iput.m4 index d9f0da58c..ac0e18804 100644 --- a/test/nc_test/test_iput.m4 +++ b/test/nc_test/test_iput.m4 @@ -76,12 +76,12 @@ define(`CheckRange3', #include "tests.h" static double -hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) +hash2nc(const nc_type xtype, int v_rank, MPI_Offset *index) { double min; double max; - switch (var_type) { + switch (xtype) { /* no type conversion will happen for NC_CHAR, use in-memory limits */ case NC_CHAR: min = CHAR_MIN; max = (double)CHAR_MAX; break; case NC_BYTE: min = X_BYTE_MIN; max = (double)X_BYTE_MAX; break; @@ -98,16 +98,16 @@ hash2nc(const nc_type var_type, int var_rank, MPI_Offset *index) return NC_EBADTYPE; } - return MAX(min, MIN(max, hash(var_type, var_rank, index))); + return MAX(min, MIN(max, hash(xtype, v_rank, index))); } static int -dbls2ncs(size_t nels, int var_type, double *inBuf, void *outBuf) +dbls2ncs(size_t nels, int xtype, double *inBuf, void *outBuf) { size_t i; char *p = (char*)outBuf; for (i=0; i0) { + c_wbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + c_rbuf[0] = (long long*) malloc(sizeof(long long) * bufsize); + for (i=1; i0) { - cbuffer[0] = (long long*) malloc(sizeof(long long) * bufsize); - for (i=1; i0) free(cbuffer[0]); - for (i=0; i @@ -298,9 +284,6 @@ distribution. 
All test programs are designed to run on 4 MPI processes. *** TESTING C test_erange for checking for NC_ERANGE ------ pass *** TESTING C test_fillvalue for _FillValue for NC_GLOBAL ------ pass *** TESTING C test_get_varn for get_varn ------ pass - *** TESTING C test_vard for vard put and get ------ pass - *** TESTING C test_vard_multiple for vard to 2 variables ------ pass - *** TESTING C test_vard_rec for vard put on record var ------ pass *** TESTING C test_varm for get/put varm ------ pass *** TESTING C tst_def_var_fill for def_var_fill ------ pass *** TESTING C tst_dimsizes for defining max dimension sizes ------ pass @@ -336,8 +319,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING C examples/C/time_var ------ pass *** TESTING C examples/C/transpose2D ------ pass *** TESTING C examples/C/transpose ------ pass - *** TESTING C examples/C/vard_int ------ pass - *** TESTING C examples/C/vard_mvars ------ pass *** TESTING F77 examples/F77/block_cyclic.exe77 ------ pass *** TESTING F77 examples/F77/bput_varn_int8.exe77 ------ pass *** TESTING F77 examples/F77/column_wise.exe77 ------ pass @@ -352,7 +333,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING F77 examples/F77/put_varn_real.exe77 ------ pass *** TESTING F77 examples/F77/time_var.exe77 ------ pass *** TESTING F77 examples/F77/transpose.exe77 ------ pass - *** TESTING F77 examples/F77/vard_int.exe77 ------ pass *** TESTING F90 examples/F90/block_cyclic.exe90 ------ pass *** TESTING F90 examples/F90/column_wise.exe90 ------ pass *** TESTING F90 examples/F90/fill_mode.exe90 ------ pass @@ -364,7 +344,6 @@ distribution. All test programs are designed to run on 4 MPI processes. *** TESTING F90 examples/F90/put_varn_int.exe90 ------ pass *** TESTING F90 examples/F90/put_varn_real.exe90 ------ pass *** TESTING F90 examples/F90/transpose.exe90 ------ pass - *** TESTING F90 examples/F90/vard_int.exe90 ------ pass Total number of tested programs: 105 diff --git a/test/test_installed/makefile b/test/test_installed/makefile index b643b4925..87b9a9a18 100644 --- a/test/test_installed/makefile +++ b/test/test_installed/makefile @@ -61,9 +61,6 @@ testcases_src = ../testcases/add_var.c \ ../testcases/test_erange.c \ ../testcases/test_fillvalue.c \ ../testcases/test_get_varn.c \ - ../testcases/test_vard.c \ - ../testcases/test_vard_multiple.c \ - ../testcases/test_vard_rec.c \ ../testcases/test_varm.c \ ../testcases/tst_def_var_fill.c \ ../testcases/tst_del_attr.c \ @@ -109,10 +106,7 @@ examples_C_src = ../../examples/C/block_cyclic.c \ ../../examples/C/put_varn_int.c \ ../../examples/C/time_var.c \ ../../examples/C/transpose2D.c \ - ../../examples/C/transpose.c \ - ../../examples/C/vard_bottom.c \ - ../../examples/C/vard_int.c \ - ../../examples/C/vard_mvars.c + ../../examples/C/transpose.c EXAMPLE_PROGS += $(examples_C_src:../../examples/C/%.c=%) %.o: ../../examples/C/%.c $(CC) $(CFLAGS) -c $< @@ -146,8 +140,7 @@ examples_F77_src = ../../examples/F77/block_cyclic.f \ ../../examples/F77/put_varn_int.f \ ../../examples/F77/put_varn_real.f \ ../../examples/F77/time_var.f \ - ../../examples/F77/transpose.f \ - ../../examples/F77/vard_int.f + ../../examples/F77/transpose.f EXAMPLE_PROGS += $(examples_F77_src:../../examples/F77/%.f=%.exe77) %.77o: ../../examples/F77/%.f @@ -163,8 +156,7 @@ examples_F90_src = ../../examples/F90/block_cyclic.f90 \ ../../examples/F90/put_var.f90 \ ../../examples/F90/put_varn_int.f90 \ ../../examples/F90/put_varn_real.f90 \ - 
../../examples/F90/transpose.f90 \ - ../../examples/F90/vard_int.f90 + ../../examples/F90/transpose.f90 EXAMPLE_PROGS += $(examples_F90_src:../../examples/F90/%.f90=%.exe90) all: env_check testutils.o utils.o $(TEST_PROGS) $(EXAMPLE_PROGS) batch.sh interactive.sh diff --git a/test/testcases/Makefile.am b/test/testcases/Makefile.am index 5c8a8a25c..41636d39f 100644 --- a/test/testcases/Makefile.am +++ b/test/testcases/Makefile.am @@ -98,7 +98,8 @@ TESTPROGRAMS = file_create_open \ test_get_varn \ tst_del_attr \ tst_redefine \ - tst_grow_header + tst_grow_header \ + tst_varn_var1 M4_SRCS = put_all_kinds.m4 \ erange_fill.m4 \ @@ -220,7 +221,7 @@ CLEANFILES = $(M4_SRCS:.m4=.c) core core.* *.gcda *.gcno *.gcov gmon.out \ $(NC_FILES) EXTRA_DIST = $(M4_SRCS) seq_runs.sh redef-good.ncdump \ - wrap_runs.sh parallel_run.sh + wrap_runs.sh # Some of these tests are designed to run on one process, # Run them on 4 processes to see if they can handle well @@ -232,21 +233,21 @@ ptest ptest4: $(check_PROGRAMS) @echo " $(subdir): Parallel testing on 4 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 4 || exit 1 + $(srcdir)/../parallel_run.sh 4 || exit 1 ptest2: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 2 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 2 || exit 1 + $(srcdir)/../parallel_run.sh 2 || exit 1 ptest6: $(check_PROGRAMS) @echo "===========================================================" @echo " $(subdir): Parallel testing on 6 MPI processes" @echo "===========================================================" @$(TESTS_ENVIRONMENT) \ - $(srcdir)/parallel_run.sh 6 || exit 1 + $(srcdir)/../parallel_run.sh 6 || exit 1 ptests: ptest2 ptest4 ptest6 ptest8 ptest10: diff --git a/test/testcases/add_var.c b/test/testcases/add_var.c index 002942014..27bcf8de0 100644 --- a/test/testcases/add_var.c +++ b/test/testcases/add_var.c @@ -58,6 +58,7 @@ tst_fmt(char *filename, int cmode) for (i=0; i<10; i++) { sprintf(var_name, "var_%d", i); err = ncmpi_def_var(ncid, var_name, NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR } err = ncmpi_enddef(ncid); CHECK_ERR @@ -70,7 +71,9 @@ tst_fmt(char *filename, int cmode) /* add 2 new variables */ err = ncmpi_def_var(ncid, "new_var1", NC_INT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_def_var(ncid, "new_var2", NC_FLOAT, 2, dimid, &varid); CHECK_ERR + err = ncmpi_def_var_fill(ncid, varid, 0, NULL); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR err = ncmpi_inq_nvars(ncid, &nvars); CHECK_ERR diff --git a/test/testcases/alignment_test.c b/test/testcases/alignment_test.c index 9646bbe12..c04a2377c 100644 --- a/test/testcases/alignment_test.c +++ b/test/testcases/alignment_test.c @@ -33,8 +33,9 @@ #define NVARS 8 #define NX 5 -int main(int argc, char** argv) { - char filename[256]; +static int tst_mode(char *filename, + int mode) +{ int i, j, rank, nprocs, err, verbose=0, nerrs=0; int ncid, cmode, varid[NVARS], dimid[2], *buf; char str[32]; @@ -43,25 +44,9 @@ int main(int argc, char** argv) { MPI_Offset header_size[2], header_extent[2]; MPI_Info info=MPI_INFO_NULL; - MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); - if (argc > 2) { - if (!rank) printf("Usage: %s 
[filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "redef1.nc"); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for alignment ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - /* create a new file for writing ----------------------------------------*/ cmode = NC_CLOBBER | NC_64BIT_DATA; err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, info, &ncid); @@ -90,6 +75,11 @@ int main(int argc, char** argv) { } err = ncmpi_enddef(ncid); CHECK_ERR + if (mode != MODE_COLL) { + err = ncmpi_begin_indep_data(ncid); + CHECK_ERR + } + /* write all variables */ buf = (int*) malloc(sizeof(int) * NX); for (i=0; i 2) { + if (!rank) printf("Usage: %s [filename]\n",argv[0]); + MPI_Finalize(); + return 1; + } + if (argc == 2) snprintf(filename, 256, "%s", argv[1]); + else strcpy(filename, "testfile.nc"); + + if (rank == 0) { + char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); + sprintf(cmd_str, "*** TESTING C %s for alignment ", basename(argv[0])); + printf("%-66s ------ ", cmd_str); fflush(stdout); + free(cmd_str); + } + + nerrs += tst_mode(filename, MODE_COLL); + if (nerrs > 0) goto err_out; + + nerrs += tst_mode(filename, MODE_INDEP); + if (nerrs > 0) goto err_out; + /* check if PnetCDF freed all internal malloc */ MPI_Offset malloc_size, sum_size; err = ncmpi_inq_malloc_size(&malloc_size); @@ -328,13 +393,13 @@ int main(int argc, char** argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } +err_out: MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); else printf(PASS_STR); } -err_out: MPI_Finalize(); return (nerrs > 0); } diff --git a/test/testcases/buftype_free.c b/test/testcases/buftype_free.c index 90ffc7645..0d8b2fe7b 100644 --- a/test/testcases/buftype_free.c +++ b/test/testcases/buftype_free.c @@ -22,16 +22,18 @@ #define NY 4 #define NX 4 +#define NVARS 4 +#define NGHOSTS 2 /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { char filename[256]; - int i, j, err, ncid, varid[4], dimids[2], req[4], st[4], nerrs=0; - int rank, nprocs, buf[4][(NY+4)*(NX+4)]; + int i, j, err, ncid, varid[NVARS], dimids[2], req[NVARS], st[NVARS], nerrs=0; + int rank, nprocs, *buf[NVARS]; int gsize[2], subsize[2], a_start[2], ghost; MPI_Offset start[2], count[2]; - MPI_Datatype buftype[4]; + MPI_Datatype buftype[NVARS]; MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); @@ -65,7 +67,14 @@ int main(int argc, char **argv) { err = ncmpi_enddef(ncid); CHECK_ERR /* initialize the contents of the array */ - for (i=0; i<4; i++) for (j=0; j<(NY+4)*(NX+4); j++) buf[i][j] = rank+10; + ghost = NGHOSTS; + gsize[1] = NX + 2 * ghost; + gsize[0] = NY + 2 * ghost; + + for (i=0; i 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -86,8 +87,9 @@ int test_collective_error(char *filename, int safe_mode, int cmode) } err = ncmpi_put_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) /* 
check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -103,12 +105,10 @@ int test_collective_error(char *filename, int safe_mode, int cmode) if (!(cmode & NC_NETCDF4)) { err = ncmpi_iput_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL /* check if user put buffer contents altered */ if (buf[0] != 1.0) { @@ -125,25 +125,26 @@ int test_collective_error(char *filename, int safe_mode, int cmode) err = ncmpi_get_vara_all(ncid, varid, start, count, buf, count[0], MPI_DOUBLE); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) err = ncmpi_get_vara_double_all(ncid, varid, start, count, buf); - if ((safe_mode && nproc > 1) || rank == 1) EXP_ERR(NC_EINVALCOORDS) - else EXP_ERR(NC_NOERR) + if ((safe_mode && nproc > 1) || rank == 1) exp = NC_EINVALCOORDS; + else exp = NC_NOERR; + CHECK_EXP_ERR_ALL(exp) if (!(cmode & NC_NETCDF4)) { err = ncmpi_iget_vara_double(ncid, varid, start, count, buf, &req); - if (rank == 1) - EXP_ERR(NC_EINVALCOORDS) - else - EXP_ERR(NC_NOERR) + exp = (rank == 1) ? NC_EINVALCOORDS : NC_NOERR; + EXP_ERR(exp) - err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR + err = ncmpi_wait_all(ncid, 1, &req, &status); CHECK_ERR_ALL } - err = ncmpi_close(ncid); CHECK_ERR + err = ncmpi_close(ncid); CHECK_ERR_ALL +fn_exit: return nerrs; } diff --git a/test/testcases/error_precedence.m4 b/test/testcases/error_precedence.m4 index 6b26b0c38..4732a9d08 100644 --- a/test/testcases/error_precedence.m4 +++ b/test/testcases/error_precedence.m4 @@ -196,6 +196,9 @@ test_format_nc$1(char *filename) foreach(`itype',(text, TYPE_LIST),`_CAT(` err=API(def_var)(ncid,"var_'itype`",NC_TYPE(itype),2,dimids,&vid_',itype`); CHECK_ERR')') + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err=API(def_var_fill)(ncid, vid_'itype`, 0, NULL); CHECK_ERR')') + /* For put attribute APIs, the error precedence is the following: * NC_EBADID, NC_EPERM, NC_ENOTVAR, NC_EBADNAME, NC_EBADTYPE, NC_ECHAR, * NC_EINVAL, NC_ENOTINDEFINE, NC_ERANGE diff --git a/test/testcases/mix_collectives.c b/test/testcases/mix_collectives.c index 583a29599..649624242 100644 --- a/test/testcases/mix_collectives.c +++ b/test/testcases/mix_collectives.c @@ -181,7 +181,7 @@ int main(int argc, char **argv) __LINE__,__FILE__,i,g_buf[i],check_buf[i]); nerrs++; free(check_buf); - goto err_out; + goto syn_err; } } } @@ -207,7 +207,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -222,7 +222,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d]=%d but got %d\n", __LINE__,__FILE__,j, j+rank*100, *val); nerrs++; - goto err_out; + goto syn_err; } val++; } @@ -238,7 +238,7 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, j*4+i + rank*100, buf[j][i]); nerrs++; - goto err_out; + goto syn_err; } } } @@ -251,11 +251,15 @@ int main(int argc, char **argv) printf("Error at line %d in %s: expecting var[%d][%d]=%d but got %d\n", __LINE__,__FILE__,j,i, -1, buf[j][i]); nerrs++; - 
goto err_out; + goto syn_err; } } } +syn_err: + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + if (nerrs) goto err_out; + /* test when different processes call put APIs with different varid */ err = ncmpi_redef(ncid); CHECK_ERR err = ncmpi_def_var(ncid, "scalar0", NC_INT, 0, NULL, &varids[0]); CHECK_ERR diff --git a/test/testcases/ncmpi_vars_null_stride.c b/test/testcases/ncmpi_vars_null_stride.c index 0cfd81aac..40884d966 100644 --- a/test/testcases/ncmpi_vars_null_stride.c +++ b/test/testcases/ncmpi_vars_null_stride.c @@ -30,11 +30,13 @@ #define NY 4 #define NX 2 +static int verbose; + static int tst_fmt(char *filename, int cmode) { int err, nerrs=0, ncid, dimid[NDIMS], varid[5], ndims=NDIMS; - int i, j, k, nprocs, rank, req, *buf; + int i, j, k, nprocs, rank, req, *buf=NULL; MPI_Offset start[NDIMS] = {0}; MPI_Offset count[NDIMS] = {0}; MPI_Offset stride[NDIMS] = {0}; @@ -54,12 +56,13 @@ tst_fmt(char *filename, int cmode) err = ncmpi_def_var(ncid, "v4", NC_INT, ndims, dimid, &varid[4]); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + buf = (int*) malloc(sizeof(int) * NY * NX); + for (i=0; i------------------------------------------------------------*/ -int main(int argc, char **argv) { - - char filename[256]; - int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; - int rank, nprocs, buf[NY+1][NX]; +static +int tst_mode(const char *filename, + int mode, + MPI_Info info) +{ + int i, j, err, ncid, varid, dimids[2], req[2], st[2], nerrs=0; + int rank, nprocs, buf[NY+1][NX]; MPI_Offset start[2], count[2]; - MPI_Info info; - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &nprocs); MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (argc > 2) { - if (!rank) printf("Usage: %s [filename]\n",argv[0]); - MPI_Finalize(); - return 1; - } - if (argc == 2) snprintf(filename, 256, "%s", argv[1]); - else strcpy(filename, "testfile.nc"); - MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); - - if (rank == 0) { - char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); - sprintf(cmd_str, "*** TESTING C %s for using ncmpi_iput_vara_int() ", basename(argv[0])); - printf("%-66s ------ ", cmd_str); fflush(stdout); - free(cmd_str); - } - - MPI_Info_create(&info); - /* When using PVFS2, unexpected buffer value error message might occur. - * This is due to a possible bug in ADIOI_PVFS2_OldWriteStrided() when - * filetype is contiguous and buftype is non-contiguous. 
- * Fix: Add ROMIO hint to force ADIO driever to use POSIX I/O */ - /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); */ - - err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); CHECK_ERR - MPI_Info_free(&info); + err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, &ncid); + CHECK_FATAL_ERR /* define a 2D array */ err = ncmpi_def_dim(ncid, "Y", NC_UNLIMITED, &dimids[0]); CHECK_ERR @@ -88,6 +64,10 @@ int main(int argc, char **argv) { err = ncmpi_def_var(ncid, "var", NC_INT, 2, dimids, &varid); CHECK_ERR err = ncmpi_enddef(ncid); CHECK_ERR + if (mode == MODE_INDEP) { + err = ncmpi_sync(ncid); CHECK_ERR + } + /* initialize the contents of the array */ for (j=0; j------------------------------------------------------------*/ +int main(int argc, char **argv) { + + char filename[256]; + int err, nerrs=0, rank; + MPI_Info info; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + if (argc > 2) { + if (!rank) printf("Usage: %s [filename]\n",argv[0]); + MPI_Finalize(); + return 1; + } + if (argc == 2) snprintf(filename, 256, "%s", argv[1]); + else strcpy(filename, "testfile.nc"); + MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD); + + if (rank == 0) { + char *cmd_str = (char*)malloc(strlen(argv[0]) + 256); + sprintf(cmd_str, "*** TESTING C %s for using ncmpi_iput_vara_int() ", basename(argv[0])); + printf("%-66s ------ ", cmd_str); fflush(stdout); + free(cmd_str); + } + + MPI_Info_create(&info); + /* When using PVFS2, unexpected buffer value error message might occur. + * This is due to a possible bug in ADIOI_PVFS2_OldWriteStrided() when + * filetype is contiguous and buftype is non-contiguous. + * Fix: Add ROMIO hint to force MPI-IO to use POSIX I/O driver */ + /* MPI_Info_set(info, "romio_pvfs2_posix_write", "enable"); */ + + /* disable internal buffering for small non-blocking APIs */ + MPI_Info_set(info, "nc_ibuf_size", "0"); + + nerrs = tst_mode(filename, MODE_COLL, MPI_INFO_NULL); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_INDEP, MPI_INFO_NULL); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_COLL, info); + if (nerrs > 0) goto err_out; + + nerrs = tst_mode(filename, MODE_INDEP, info); + if (nerrs > 0) goto err_out; + /* check if PnetCDF freed all internal malloc */ MPI_Offset malloc_size, sum_size; err = ncmpi_inq_malloc_size(&malloc_size); @@ -155,6 +212,9 @@ int main(int argc, char **argv) { if (malloc_size > 0) ncmpi_inq_malloc_list(); } +err_out: + MPI_Info_free(&info); + MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); if (rank == 0) { if (nerrs) printf(FAIL_STR,nerrs); diff --git a/test/testcases/null_args.m4 b/test/testcases/null_args.m4 index 65484f380..4fed908bc 100644 --- a/test/testcases/null_args.m4 +++ b/test/testcases/null_args.m4 @@ -295,11 +295,18 @@ test_format_nc$1(char *filename) /* define variables */dnl foreach(`itype',(text, TYPE_LIST),`_CAT(` err = ncmpi_def_var(ncid,"var_'itype`",NC_TYPE(itype),2,dimid,&vid_',itype`); - EXP_ERR_MSG(NC_NOERR,"def_var")')') + EXP_ERR_MSG(NC_NOERR,"def_var") + err = ncmpi_def_var_fill(ncid, vid_'itype`, 0, NULL); + EXP_ERR_MSG(NC_NOERR,"def_var_fill")')') err = ncmpi_enddef(ncid); EXP_ERR_MSG(NC_NOERR,"enddef") + /* fill the 1st record of all variables */dnl + foreach(`itype',(text, TYPE_LIST),`_CAT(` + err = ncmpi_fill_var_rec(ncid, vid_'itype`, 0); + EXP_ERR_MSG(NC_NOERR,"fill_var_rec")')') + start[0] = start[1] = 0; count[0] = count[1] = 1; stride[0] = stride[1] = 1; diff --git 
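
The two m4 test generators above (error_precedence.m4 and null_args.m4) now exercise the fill APIs right after defining each variable. In PnetCDF, ncmpi_def_var_fill() with no_fill == 0 enables fill mode for one variable, a NULL fill_value selects the type's default, and ncmpi_fill_var_rec() fills one record of a record variable once the file is in data mode. A minimal sketch of that call sequence (the helper and variable names are illustrative):

    #include <pnetcdf.h>

    /* Sketch: define a variable with fill mode on, then fill record 0 */
    static int def_var_with_fill(int ncid, int dimids[2], int *varidp)
    {
        int err;

        err = ncmpi_def_var(ncid, "var_int", NC_INT, 2, dimids, varidp);
        if (err != NC_NOERR) return err;

        /* no_fill == 0 turns fill mode on; NULL uses the default fill value */
        err = ncmpi_def_var_fill(ncid, *varidp, 0, NULL);
        if (err != NC_NOERR) return err;

        err = ncmpi_enddef(ncid);
        if (err != NC_NOERR) return err;

        /* fill the 1st record (requires a record variable) */
        return ncmpi_fill_var_rec(ncid, *varidp, 0);
    }
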
a/test/testcases/parallel_run.sh b/test/testcases/parallel_run.sh index 0abc8b12f..4418f27e0 100755 --- a/test/testcases/parallel_run.sh +++ b/test/testcases/parallel_run.sh @@ -21,7 +21,7 @@ OUTDIR=`echo "$TESTOUTDIR" | cut -d: -f2-` NTHREADS=`expr $1 \* 6 - 1` # echo "PNETCDF_DEBUG = ${PNETCDF_DEBUG}" -if test ${PNETCDF_DEBUG} = 1 ; then +if test "x${PNETCDF_DEBUG}" = x1 ; then safe_modes="0 1" else safe_modes="0" @@ -32,17 +32,53 @@ unset PNETCDF_HINTS for i in ${check_PROGRAMS} ; do for j in ${safe_modes} ; do - for intra_aggr in 0 1 ; do if test "$j" = 1 ; then # test only in safe mode - export PNETCDF_HINTS="romio_no_indep_rw=true" + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi + OUT_PREFIX="${TESTOUTDIR}/$i" + + for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + DRIVER_OUT_FILE="${OUT_PREFIX}.mpio" else - export PNETCDF_HINTS= + USEMPIO_HINTS="nc_pncio=enable" + DRIVER_OUT_FILE="${OUT_PREFIX}.pncio" fi + for intra_aggr in 0 1 ; do if test "$intra_aggr" = 1 ; then - export PNETCDF_HINTS="${PNETCDF_HINTS};nc_num_aggrs_per_node=2" + INA_HINTS="nc_num_aggrs_per_node=2" + INA_OUT_FILE="${DRIVER_OUT_FILE}.ina" + else + INA_HINTS="nc_num_aggrs_per_node=0" + INA_OUT_FILE="${DRIVER_OUT_FILE}" + fi + + OUT_FILE=$INA_OUT_FILE + + if [[ "$i" == *"vard"* ]] ; then + if test "x$mpiio_mode" == x0 || test "x$intra_aggr" == x1 ; then + # vard APIs are not supported when using PNCIO + continue + fi + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" fi + if test "x$INA_HINTS" != x ; then + PNETCDF_HINTS="$INA_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" export PNETCDF_SAFE_MODE=$j - # echo "set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" if test "$i" = tst_version ; then ${MPIRUN} ./tst_version @@ -51,25 +87,25 @@ for i in ${check_PROGRAMS} ; do if test "$i" = tst_pthread ; then # each MPI process created 6 threads - ${MPIRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc + ${MPIRUN} ./tst_pthread ${OUT_FILE}.nc for k in `seq 0 ${NTHREADS}` ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$k + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc.$k rm -f ${OUTDIR}/tst_pthread.nc.$k done continue fi - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc + ${MPIRUN} ./$i ${OUT_FILE}.nc # put_all_kinds and iput_all_kinds output 3 files if test "$i" = put_all_kinds -o "$i" = iput_all_kinds ; then for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc$k + # echo "--- validating file ${OUT_FILE}.nc$k" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc$k done else - # echo "--- validating file ${TESTOUTDIR}/$i.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.nc + # echo "--- validating file ${OUT_FILE}.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.nc fi # echo "" @@ -77,21 +113,21 @@ for i in ${check_PROGRAMS} ; do # echo "---- test burst buffering feature" saved_PNETCDF_HINTS=${PNETCDF_HINTS} export PNETCDF_HINTS="${PNETCDF_HINTS};nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.bb.nc + ${MPIRUN} ./$i ${OUT_FILE}.bb.nc export PNETCDF_HINTS=${saved_PNETCDF_HINTS} # put_all_kinds and iput_all_kinds output 3 files if test "$i" = put_all_kinds -o "$i" = iput_all_kinds 
; then for k in 1 2 5 ; do - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc$k" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc$k + # echo "--- validating file ${OUT_FILE}.bb.nc$k" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc$k # echo "--- ncmpidiff $i.nc$k $i.bb.nc$k ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc$k ${TESTOUTDIR}/$i.bb.nc$k + ${MPIRUN} ${NCMPIDIFF} -q ${OUT_FILE}.nc$k ${OUT_FILE}.bb.nc$k done continue else - # echo "--- validating file ${TESTOUTDIR}/$i.bb.nc" - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/$i.bb.nc + # echo "--- validating file ${OUT_FILE}.bb.nc" + ${TESTSEQRUN} ${VALIDATOR} -q ${OUT_FILE}.bb.nc fi # skip ncmpidiff for large file @@ -100,17 +136,57 @@ for i in ${check_PROGRAMS} ; do fi # echo "--- ncmpidiff $i.nc $i.bb.nc ---" - ${MPIRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/$i.nc ${TESTOUTDIR}/$i.bb.nc + ${MPIRUN} ${NCMPIDIFF} -q ${OUT_FILE}.nc ${OUT_FILE}.bb.nc fi if test "x${ENABLE_NETCDF4}" = x1 ; then # echo "test netCDF-4 feature" - ${MPIRUN} ./$i ${TESTOUTDIR}/$i.nc4 4 + ${MPIRUN} ./$i ${OUT_FILE}.nc4 4 # Validator does not support nc4 fi - done - done - rm -f ${OUTDIR}/$i.nc* - rm -f ${OUTDIR}/$i.bb.nc* -done + done # intra_aggr + done # mpiio_mode + + if test "$i" = tst_version ; then + # this program creates no output file + continue + fi + if [[ "$i" == *"vard"* ]] ; then + continue + fi + + DIFF_OPT="-q" + if test "$i" = last_large_var ; then + DIFF_OPT+=" -h" + fi + if test "$i" = put_all_kinds || test "$i" = iput_all_kinds ; then + for j in 1 2 5; do + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.mpio.ina.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.mpio.ina.nc$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.nc$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.ina.nc$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc$j $OUT_PREFIX.pncio.ina.nc$j + done + elif test "$i" = tst_pthread ; then + for j in `seq 0 ${NTHREADS}` ; do + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.mpio.ina.nc.$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.nc.$j + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc.$j $OUT_PREFIX.pncio.ina.nc.$j + done + else + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.mpio.ina.nc + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.nc + # echo "--- ncmpidiff $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc ---" + $MPIRUN $NCMPIDIFF $DIFF_OPT $OUT_PREFIX.mpio.nc $OUT_PREFIX.pncio.ina.nc + fi + + done # safe_modes + rm -f ${OUTDIR}/$i*nc* +done # check_PROGRAMS diff --git a/test/testcases/seq_runs.sh b/test/testcases/seq_runs.sh index 270536cc6..32a01bfcb 100755 --- a/test/testcases/seq_runs.sh +++ b/test/testcases/seq_runs.sh @@ -12,69 +12,96 @@ VALIDATOR=../../src/utils/ncvalidator/ncvalidator # prevent user environment setting of PNETCDF_HINTS to interfere unset PNETCDF_HINTS -${TESTSEQRUN} ./tst_version - -${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q 
${TESTOUTDIR}/put_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc5 - -${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.nc -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc1 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc2 -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc5 - -NCMPIGEN=../../src/utils/ncmpigen/ncmpigen -NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff - -# remove the file system type prefix name if there is any. -OUT_PATH=`echo "$TESTOUTDIR" | cut -d: -f2-` - -rm -f ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc -${TESTSEQRUN} ${NCMPIGEN} -v 5 -o ${TESTOUTDIR}/redef1.nc ${srcdir}/redef-good.ncdump -${TESTSEQRUN} ./redef1 ${TESTOUTDIR}/testfile.nc -${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.nc ${TESTOUTDIR}/redef1.nc -# diff -q ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc - -${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.nc -rm -f ${OUT_PATH}/redef1.nc -rm -f ${OUT_PATH}/testfile.nc - -# echo "" - -if test "x${ENABLE_BURST_BUFFER}" = x1 ; then - echo "" - echo "---- testing burst buffering" - - # Run using burst buffer driver - export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" - ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.bb.nc - ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.bb.nc - unset PNETCDF_HINTS - - # Compare - for i in 1 2 5 ; do - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/put_all_kinds.nc$i ${TESTOUTDIR}/put_all_kinds.bb.nc$i - ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/iput_all_kinds.nc$i ${TESTOUTDIR}/iput_all_kinds.bb.nc$i - done -fi -rm -f ${OUT_PATH}/put_all_kinds.nc* -rm -f ${OUT_PATH}/put_all_kinds.bb.nc* -rm -f ${OUT_PATH}/iput_all_kinds.nc* -rm -f ${OUT_PATH}/iput_all_kinds.bb.nc* - -# echo "" - -if test "${ENABLE_THREAD_SAFE}" = 1 ; then - # echo "---- testing thread safety" - for j in 0 1 ; do - export PNETCDF_SAFE_MODE=$j - # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" - - ${TESTSEQRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc - for i in 0 1 2 3 4 5 ; do - ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$i - rm -f ${OUT_PATH}/tst_pthread.nc.$i +for j in ${safe_modes} ; do + if test "$j" = 1 ; then # test only in safe mode + SAFE_HINTS="romio_no_indep_rw=true" + else + SAFE_HINTS="romio_no_indep_rw=false" + fi +for mpiio_mode in 0 1 ; do + if test "$mpiio_mode" = 1 ; then + USEMPIO_HINTS="nc_pncio=disable" + else + USEMPIO_HINTS="nc_pncio=enable" + fi + + PNETCDF_HINTS= + if test "x$SAFE_HINTS" != x ; then + PNETCDF_HINTS="$SAFE_HINTS" + fi + if test "x$USEMPIO_HINTS" != x ; then + PNETCDF_HINTS="$USEMPIO_HINTS;$PNETCDF_HINTS" + fi + + export PNETCDF_HINTS="$PNETCDF_HINTS" + export PNETCDF_SAFE_MODE=$j + # echo "PNETCDF_SAFE_MODE=$PNETCDF_SAFE_MODE PNETCDF_HINTS=$PNETCDF_HINTS" + + ${TESTSEQRUN} ./tst_version + + ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.nc + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc1 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc2 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/put_all_kinds.nc5 + + ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.nc + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc1 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc2 + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/iput_all_kinds.nc5 + + NCMPIGEN=../../src/utils/ncmpigen/ncmpigen + 
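
As in parallel_run.sh above, seq_runs.sh now wraps its whole body in loops over safe mode and the nc_pncio setting, composing PNETCDF_HINTS as a semicolon-separated list before each run. The same hints can also be set programmatically through an MPI_Info object at file-creation time; a minimal sketch (the helper name is illustrative; the hint keys are the ones the scripts use):

    #include <mpi.h>
    #include <pnetcdf.h>

    /* Sketch: programmatic equivalent of exporting
     *   PNETCDF_HINTS="nc_num_aggrs_per_node=2;nc_pncio=enable;romio_no_indep_rw=true" */
    static int create_with_hints(const char *filename, int *ncidp)
    {
        int err;
        MPI_Info info;

        MPI_Info_create(&info);
        MPI_Info_set(info, "romio_no_indep_rw", "true");  /* safe-mode runs */
        MPI_Info_set(info, "nc_pncio", "enable");         /* select the PNCIO driver */
        MPI_Info_set(info, "nc_num_aggrs_per_node", "2"); /* intra-node aggregation */

        err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, info, ncidp);
        MPI_Info_free(&info);
        return err;
    }
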
NCMPIDIFF=../../src/utils/ncmpidiff/ncmpidiff + + # remove the file system type prefix name if there is any. + OUT_PATH=`echo "$TESTOUTDIR" | cut -d: -f2-` + + rm -f ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc + ${TESTSEQRUN} ${NCMPIGEN} -v 5 -o ${TESTOUTDIR}/redef1.nc ${srcdir}/redef-good.ncdump + ${TESTSEQRUN} ./redef1 ${TESTOUTDIR}/testfile.nc + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/testfile.nc ${TESTOUTDIR}/redef1.nc + # diff -q ${OUT_PATH}/testfile.nc ${OUT_PATH}/redef1.nc + + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/testfile.nc + rm -f ${OUT_PATH}/redef1.nc + rm -f ${OUT_PATH}/testfile.nc + + # echo "" + + if test "x${ENABLE_BURST_BUFFER}" = x1 ; then + echo "" + echo "---- testing burst buffering" + + # Run using burst buffer driver + export PNETCDF_HINTS="nc_burst_buf=enable;nc_burst_buf_dirname=${TESTOUTDIR};nc_burst_buf_overwrite=enable" + ${TESTSEQRUN} ./put_all_kinds ${TESTOUTDIR}/put_all_kinds.bb.nc + ${TESTSEQRUN} ./iput_all_kinds ${TESTOUTDIR}/iput_all_kinds.bb.nc + unset PNETCDF_HINTS + + # Compare + for i in 1 2 5 ; do + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/put_all_kinds.nc$i ${TESTOUTDIR}/put_all_kinds.bb.nc$i + ${TESTSEQRUN} ${NCMPIDIFF} -q ${TESTOUTDIR}/iput_all_kinds.nc$i ${TESTOUTDIR}/iput_all_kinds.bb.nc$i done - done -fi + fi + rm -f ${OUT_PATH}/put_all_kinds.nc* + rm -f ${OUT_PATH}/put_all_kinds.bb.nc* + rm -f ${OUT_PATH}/iput_all_kinds.nc* + rm -f ${OUT_PATH}/iput_all_kinds.bb.nc* + + # echo "" + + if test "${ENABLE_THREAD_SAFE}" = 1 ; then + # echo "---- testing thread safety" + for j in 0 1 ; do + export PNETCDF_SAFE_MODE=$j + # echo "---- set PNETCDF_SAFE_MODE ${PNETCDF_SAFE_MODE}" + + ${TESTSEQRUN} ./tst_pthread ${TESTOUTDIR}/tst_pthread.nc + for i in 0 1 2 3 4 5 ; do + ${TESTSEQRUN} ${VALIDATOR} -q ${TESTOUTDIR}/tst_pthread.nc.$i + rm -f ${OUT_PATH}/tst_pthread.nc.$i + done + done + fi +done +done diff --git a/test/testcases/test_vard.c b/test/testcases/test_vard.c index 5ea0ded0f..3fbad06ad 100644 --- a/test/testcases/test_vard.c +++ b/test/testcases/test_vard.c @@ -64,6 +64,7 @@ if (buf[j][i] != val+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,val+i,buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } \ @@ -74,6 +75,7 @@ if (buf[j][i] != rank*100+j*10+i) { \ printf("line %d: expecting buf[%d][%d]=%d but got %d\n",__LINE__,j,i,rank*100+j*10+i,(int)buf[j][i]); \ nerrs++; \ + goto fn_exit; \ } \ } \ } @@ -143,20 +145,23 @@ int get_var_and_verify(int ncid, nerrs++; } } + +fn_exit: free(ncbuf); + return nerrs; } /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, ncid, varid0, varid1, varid2, dimids[2], nerrs=0; int rank, nprocs, blocklengths[2], **buf, *bufptr; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; int buftype_size, expected_put_size, format; - float **flt_buf, *flt_bufptr; - double **dbl_buf, *dbl_bufptr; + float **flt_buf=NULL, *flt_bufptr; + double **dbl_buf=NULL, *dbl_bufptr; MPI_Offset start[2], count[2], header_size, put_size, new_put_size; MPI_Aint a0, a1, disps[2]; MPI_Datatype buftype, ghost_buftype, rec_filetype, fix_filetype; @@ -182,6 +187,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. 
+ */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + /* construct various MPI derived data types */ buf = (int**)malloc(sizeof(int*) * NY); @@ -486,6 +504,7 @@ int main(int argc, char **argv) { } free(schar_buf); +fn_exit: MPI_Type_free(&rec_filetype); MPI_Type_free(&fix_filetype); MPI_Type_free(&buftype); @@ -495,8 +514,14 @@ int main(int argc, char **argv) { free(array_of_blocklengths); free(array_of_displacements); free(buf[0]); free(buf); - free(flt_buf[0]); free(flt_buf); - free(dbl_buf[0]); free(dbl_buf); + if (flt_buf != NULL) { + free(flt_buf[0]); + free(flt_buf); + } + if (dbl_buf != NULL) { + free(dbl_buf[0]); + free(dbl_buf); + } err = ncmpi_close(ncid); CHECK_ERR diff --git a/test/testcases/test_vard_multiple.c b/test/testcases/test_vard_multiple.c index 1e47dbeb4..5e1444942 100644 --- a/test/testcases/test_vard_multiple.c +++ b/test/testcases/test_vard_multiple.c @@ -77,7 +77,7 @@ /*----< main() >------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, ncid, varid[4], dimids[3], nerrs=0, unlimit_dimid; int rank, nprocs, *buf[2]; int array_of_sizes[2], array_of_subsizes[2], array_of_starts[2]; @@ -106,6 +106,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. + */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + buf[0] = (int*)malloc(sizeof(int) * NY * NX); for (j=0; j------------------------------------------------------------*/ int main(int argc, char **argv) { - char filename[256]; + char filename[256], *hint_value; int i, j, err, nerrs=0, ncid, varid, dimids[2], unlimit_dimid; int rank, nprocs, verbose, array_of_blocklengths[2], buf[NY][NX]; MPI_Offset recsize, len; @@ -72,6 +72,19 @@ int main(int argc, char **argv) { free(cmd_str); } + /* Skip test when intra-node aggregation is enabled, as vard APIs are not + * supported. 
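
The vard tests above (and test_vard_unlim.c below) skip themselves whenever the nc_num_aggrs_per_node hint requests intra-node aggregation, since the vard APIs are not supported in that mode. inq_env_hint() is a helper from the suite's testutils; a rough standalone approximation of such a lookup, scanning the semicolon-separated PNETCDF_HINTS environment variable (parsing is simplified and the function name is illustrative):

    #include <stdlib.h>
    #include <string.h>

    /* Sketch: return 1 and copy out the value if "key=value" appears in
     * the PNETCDF_HINTS environment variable, e.g.
     *   PNETCDF_HINTS="nc_num_aggrs_per_node=2;nc_pncio=enable" */
    static int lookup_env_hint(const char *key, char *value, size_t len)
    {
        size_t n;
        char *hit, *env = getenv("PNETCDF_HINTS");

        if (env == NULL) return 0;
        hit = strstr(env, key);                    /* simplified matching */
        if (hit == NULL || hit[strlen(key)] != '=') return 0;

        hit += strlen(key) + 1;
        n = strcspn(hit, ";");                     /* value ends at ';' or NUL */
        if (n >= len) n = len - 1;
        memcpy(value, hit, n);
        value[n] = '\0';
        return 1;
    }

A test can then call lookup_env_hint("nc_num_aggrs_per_node", v, sizeof(v)) and skip when atoi(v) > 0, mirroring the checks in the hunks above.
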
+ */ + if (inq_env_hint("nc_num_aggrs_per_node", &hint_value)) { + if (atoi(hint_value) > 0) { + free(hint_value); + if (rank == 0) printf(SKIP_STR); + MPI_Finalize(); + return 0; + } + free(hint_value); + } + /* create a new file for write */ err = ncmpi_create(MPI_COMM_WORLD, filename, NC_CLOBBER, MPI_INFO_NULL, &ncid); CHECK_ERR diff --git a/test/testcases/test_varm.c b/test/testcases/test_varm.c index 0248ec400..cd84ca595 100644 --- a/test/testcases/test_varm.c +++ b/test/testcases/test_varm.c @@ -14,6 +14,8 @@ #include +static int verbose; + static int check_read_contents(float *rh) { @@ -25,10 +27,9 @@ check_read_contents(float *rh) for (i=0; i<6; i++) { for (j=0; j<4; j++) { if (rh[j*6+i] != k) { -#ifdef PRINT_ERR_ON_SCREEN - printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", - __FILE__,__LINE__,j,i,k,rh[j*6+i]); -#endif + if (verbose) + printf("Error at %s:%d : expect rh[%d][%d]=%f but got %f\n", + __FILE__,__LINE__,j,i,k,rh[j*6+i]); return 1; } k += 1.0; @@ -71,11 +72,10 @@ check_write_contents(signed char *varT) for (j=0; j<4; j++) { for (i=0; i<6; i++) { if (varT[j*6+i] != j*6+i + 50) { -#ifdef PRINT_ERR_ON_SCREEN - /* this error is a pnetcdf internal error, if occurs */ - printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", - __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); -#endif + if (verbose) + /* this error is a pnetcdf internal error, if occurs */ + printf("Error at line %d in %s: expecting varT[%d][%d]=%d but got %d\n", + __LINE__,__FILE__,j,i,j*6+i + 50,varT[j*6+i]); return 1; } } @@ -97,7 +97,8 @@ tst_fmt(char *filename, int cmode) MPI_Comm_size(MPI_COMM_WORLD, &nprocs); cmode |= NC_CLOBBER; - err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_create(MPI_COMM_WORLD, filename, cmode, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR /* define a variable of a 6 x 4 integer array in the nc file */ err = ncmpi_def_dim(ncid, "Y", 6, &dimid[0]); CHECK_ERR @@ -124,7 +125,8 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, filename, NC_NOWRITE, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -177,7 +179,8 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR - err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); CHECK_ERR + err = ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL, &ncid); + CHECK_FATAL_ERR err = ncmpi_inq_varid(ncid, "var", &varid); CHECK_ERR @@ -186,7 +189,8 @@ tst_fmt(char *filename, int cmode) start[0] = 0; start[1] = 0; count[0] = 6; count[1] = 4; if (rank > 0) count[0] = count[1] = 0; - err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); CHECK_ERR + err = ncmpi_put_vara_int_all(ncid, varid, start, count, &var[0][0]); + CHECK_ERR /* set the contents of the write buffer varT, a 4 x 6 char array 50, 51, 52, 53, 54, 55, @@ -236,6 +240,7 @@ tst_fmt(char *filename, int cmode) err = ncmpi_close(ncid); CHECK_ERR +fn_exit: return nerrs; } @@ -264,6 +269,8 @@ int main(int argc, char **argv) free(cmd_str); } + verbose = 1; + #ifdef DEBUG if (nprocs > 1 && rank == 0) printf("Warning: %s is designed to run on 1 process\n", argv[0]); diff --git a/test/testcases/tst_varn_var1.c b/test/testcases/tst_varn_var1.c new file mode 100644 index 000000000..5217f7a3a --- /dev/null +++ b/test/testcases/tst_varn_var1.c @@ -0,0 +1,217 @@ 
+/*********************************************************************
+ *
+ * Copyright (C) 2025, Northwestern University and Argonne National Laboratory
+ * See COPYRIGHT notice in top-level directory.
+ *
+ *********************************************************************/
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+ * This example tests a single call of ncmpi_put_varn_int_all() to write a
+ * sequence of requests with arbitrary array indices, all with length == 1.
+ *
+ * The compile and run commands are given below, together with an ncmpidump of
+ * the output file.
+ *
+ * % mpicc -O2 -o tst_varn_var1 tst_varn_var1.c -lpnetcdf
+ * % mpiexec -n 4 ./tst_varn_var1 /pvfs2/wkliao/testfile.nc
+ * % ncmpidump /pvfs2/wkliao/testfile.nc
+ * netcdf testfile {
+ * // file format: CDF-5 (big variables)
+ * dimensions:
+ *         Y = 4 ;
+ *         X = 10 ;
+ *         time = UNLIMITED ; // (4 currently)
+ * variables:
+ *         int fix_var(Y, X) ;
+ *         int rec_var(time, X) ;
+ * data:
+ *
+ * fix_var =
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ;
+ *
+ * rec_var =
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _,
+ *  0, _, -1, _, -2, _, -3, _, -4, _, 0, _, -1, _, -2, _, -3, _, -4, _ ;
+ * }
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h> /* strcpy(), memset() */
+#include <libgen.h> /* basename() */
+#include <mpi.h>
+#include <pnetcdf.h>
+
+#include <testutils.h>
+
+#define NY 4
+#define NX 4
+#define NDIMS 2
+
+int main(int argc, char** argv)
+{
+    char filename[256];
+    int i, j, k, rank, nprocs, err, nerrs=0;
+    int ncid, cmode, varid[2], dimid[2], nreqs, req, *buf;
+    MPI_Offset **starts=NULL;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
+
+    if (argc > 2) {
+        if (!rank) printf("Usage: %s [filename]\n",argv[0]);
+        MPI_Finalize();
+        return 1;
+    }
+    if (argc == 2) snprintf(filename, 256, "%s", argv[1]);
+    else strcpy(filename, "testfile.nc");
+    MPI_Bcast(filename, 256, MPI_CHAR, 0, MPI_COMM_WORLD);
+
+    if (rank == 0) {
+        char *cmd_str = (char*)malloc(strlen(argv[0]) + 256);
+        sprintf(cmd_str, "*** TESTING C %s for ncmpi_put_varn_int_all() ", basename(argv[0]));
+        printf("%-66s ------ ", cmd_str); fflush(stdout);
+        free(cmd_str);
+    }
+
+    buf = (int*) malloc(sizeof(int) * NY * NX);
+
+    nreqs = NY * NX * nprocs;
+    starts = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * nreqs);
+    starts[0] = (MPI_Offset*) calloc(nreqs * NDIMS, sizeof(MPI_Offset));
+    for (i=1; i 0)
+            printf("heap memory allocated by PnetCDF internally has "OFFFMT" bytes yet to be freed\n",
+                   sum_size);
+        if (malloc_size > 0) ncmpi_inq_malloc_list();
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &nerrs, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+    if (rank == 0) {
+        if (nerrs) printf(FAIL_STR,nerrs);
+        else printf(PASS_STR);
+    }
+
+    MPI_Finalize();
+    return (nerrs > 0);
+}
+
diff --git a/test/testcases/tst_version.c b/test/testcases/tst_version.c
index eee7d65fb..74c4e73bc 100644
--- a/test/testcases/tst_version.c
+++ b/test/testcases/tst_version.c
@@ -2,7 +2,7 @@
  * Copyright (C) 2019, Northwestern University and Argonne National Laboratory
  * See
COPYRIGHT notice in top-level directory. * - * Check whether PnetCDF version string returned from ncmpi_inq_libvers() + * Check whether PnetCDF version string returned from ncmpi_inq_libvers() * matches the constant PNETCDF_VERSION defined in header file pnetcdf.h. * */ diff --git a/test/testcases/varn_int.c b/test/testcases/varn_int.c index f82469086..4f5b2d045 100644 --- a/test/testcases/varn_int.c +++ b/test/testcases/varn_int.c @@ -23,11 +23,11 @@ * X = 10 ; * REC_DIM = UNLIMITED ; // (4 currently) * variables: - * int var(Y, X) ; + * int fix_var(Y, X) ; * int rec_var(REC_DIM, X) ; * data: * - * var = + * fix_var = * 13, 13, 13, 11, 11, 10, 10, 12, 11, 11, * 10, 12, 12, 12, 13, 11, 11, 12, 12, 12, * 11, 11, 12, 13, 13, 13, 10, 10, 11, 11, @@ -91,7 +91,7 @@ int main(int argc, char** argv) { char filename[256]; int i, j, rank, nprocs, err, nerrs=0; - int ncid, cmode, varid[3], dimid[2], num_reqs, *buffer, *r_buffer; + int ncid, cmode, varid[2], dimid[2], num_reqs, *buffer, *r_buffer; MPI_Offset w_len, **starts=NULL, **counts=NULL; MPI_Init(&argc, &argv); @@ -129,7 +129,7 @@ int main(int argc, char** argv) CHECK_ERR err = ncmpi_def_dim(ncid, "X", NX, &dimid[1]); CHECK_ERR - err = ncmpi_def_var(ncid, "var", NC_INT, NDIMS, dimid, &varid[0]); + err = ncmpi_def_var(ncid, "fix_var", NC_INT, NDIMS, dimid, &varid[0]); CHECK_ERR err = ncmpi_def_dim(ncid, "REC_DIM", NC_UNLIMITED, &dimid[0]); CHECK_ERR @@ -290,7 +290,7 @@ int main(int argc, char** argv) for (i=0; i
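
The section closes with two varn tests: varn_int.c (whose fixed-size variable is renamed fix_var to match the new tst_varn_var1.c) passes explicit per-request starts[] and counts[], while tst_varn_var1.c writes many single-element requests in one collective call. In the varn family, counts may be passed as NULL, in which case every request accesses exactly one array element. A minimal sketch of that single-element convention, using the same contiguous-allocation idiom as tst_varn_var1.c (the helper name is illustrative):

    #include <stdlib.h>
    #include <mpi.h>
    #include <pnetcdf.h>

    /* Sketch: write num single elements of an ndims-dimensional variable
     * in one call. starts[i] points to an ndims-long index vector; a NULL
     * counts argument means each request has length 1 in every dimension. */
    static int put_single_elements(int ncid, int varid, int ndims, int num,
                                   const int *buf)
    {
        int i, err;
        MPI_Offset **starts;

        /* one contiguous index block, carved into num vectors */
        starts = (MPI_Offset**) malloc(sizeof(MPI_Offset*) * num);
        starts[0] = (MPI_Offset*) calloc(num * ndims, sizeof(MPI_Offset));
        for (i=1; i<num; i++) starts[i] = starts[i-1] + ndims;

        /* ... set starts[i][d] to the target indices of each request ... */

        err = ncmpi_put_varn_int_all(ncid, varid, num, starts, NULL, buf);

        free(starts[0]);
        free(starts);
        return err;
    }
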