updates for Aurora-DAOS regression testing

Shane Snyder · Shane Snyder · commit 49ba1b40e50c · 2025-05-09T04:48:14.000Z
diff --git a/darshan-test/regression/alcf-aurora-daos-ld-preload/env.sh b/darshan-test/regression/alcf-aurora-daos-ld-preload/env.sh
diff --git a/darshan-test/regression/alcf-aurora-daos-ld-preload/pbs-submit.sh b/darshan-test/regression/alcf-aurora-daos-ld-preload/pbs-submit.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# PBS job script: runs the test command with libdarshan.so LD_PRELOADed, bracketed by launch-dfuse.sh / clean-dfuse.sh
+module use /soft/modulefiles
+module load daos/base
+
+launch-dfuse.sh "${DAOS_POOL}:${DAOS_CONT}"
+mount | grep dfuse  # show the dfuse mount (if any) in the job output
+
+nprocs=$DARSHAN_DEFAULT_NPROCS
+nnodes=$(wc -l < "$PBS_NODEFILE")
+ppn=$((nprocs / nnodes))
+# DARSHAN_SCRIPT_ARGS is intentionally unquoted: it holds a whole command line that must be word-split
+mpiexec -n "$nprocs" --ppn "$ppn" --env LD_PRELOAD="$DARSHAN_RUNTIME_PATH/lib/libdarshan.so" $DARSHAN_SCRIPT_ARGS
+EXIT_STATUS=$?
+# run the dfuse cleanup regardless of the result, then report mpiexec's status rather than the cleanup's
+clean-dfuse.sh "${DAOS_POOL}:${DAOS_CONT}"
+
+exit "$EXIT_STATUS"
diff --git a/darshan-test/regression/alcf-aurora-daos-ld-preload/runjob.sh b/darshan-test/regression/alcf-aurora-daos-ld-preload/runjob.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Submit the given command as a PBS job, poll qstat until the job finishes, and exit 0 only if the job exited 0.
+PROJ=radix-io
+
+# can't pass args to scripts with PBS, so we assign to an env
+# var and reference that in the submit script
+export DARSHAN_SCRIPT_ARGS="$*"
+
+# set list of env vars to pass through to PBS job
+ENV_VAR_LIST="DARSHAN_LOGFILE,DARSHAN_DEFAULT_NPROCS,DARSHAN_SCRIPT_ARGS,DARSHAN_RUNTIME_PATH,DAOS_POOL,DAOS_CONT"
+if [ -n "${DXT_ENABLE_IO_TRACE+defined}" ]; then
+	ENV_VAR_LIST="$ENV_VAR_LIST,DXT_ENABLE_IO_TRACE"
+fi
+
+# submit job and wait for it to return
+jobid=$(qsub -A "$PROJ" -q debug -l select=1,walltime=0:10:00,filesystems=home:daos_user,daos=daos_user -v "$ENV_VAR_LIST" -o "$DARSHAN_TMP/$$-tmp.out" -e "$DARSHAN_TMP/$$-tmp.err" "$DARSHAN_TESTDIR/$DARSHAN_PLATFORM/pbs-submit.sh")
+
+if [ $? -ne 0 ]; then
+        echo "Error: failed to qsub $*"
+        exit 1
+fi
+
+# qstat seems to return errors a lot here... so use a retry loop
+retries=0
+max_retries=5
+while true; do
+    sleep 5
+    qstat_output=$(qstat -f -x "$jobid")
+    if [[ $? -ne 0 ]]; then
+        echo "qstat failed (attempt $((retries + 1)) of $max_retries)"
+        ((retries++))
+        if [[ $retries -ge $max_retries ]]; then
+            echo "qstat failed $max_retries times. Giving up."
+            exit 1
+        fi
+        continue
+    fi
+
+    # reset retry counter on successful qstat
+    retries=0
+
+    # determine if job finished, and break out of loop if so
+    job_state=$(echo "$qstat_output" | grep job_state | tr -d '[:blank:]' | cut -d= -f2)
+    if [[ "$job_state" == "F" ]]; then
+        break
+    fi
+done
+# succeed only on an explicit Exit_status of 0; a missing or unparsable value must not be reported as a pass
+job_exit=$(echo "$qstat_output" | grep Exit_status | tr -d '[:blank:]' | cut -d= -f2)
+if [ "$job_exit" = "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/darshan-test/regression/alcf-aurora-ld-preload/runjob.sh b/darshan-test/regression/alcf-aurora-ld-preload/runjob.sh
diff --git a/darshan-test/regression/alcf-aurora-lustre-ld-preload/env.sh b/darshan-test/regression/alcf-aurora-lustre-ld-preload/env.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# General notes
+#######################
+
+# Script to set up the environment for tests on this platform.  Must export
+# the following environment variables:
+# 
+# DARSHAN_CC: command to compile C programs
+# DARSHAN_CXX: command to compile C++ programs
+# DARSHAN_F90: command to compile Fortran90 programs
+# DARSHAN_F77: command to compile Fortran77 programs
+# DARSHAN_RUNJOB: command to execute a job and wait for its completion
+
+# This script may load optional modules (as in a Cray PE), set LD_PRELOAD
+# variables (as in a dynamically linked environment), or generate mpicc
+# wrappers (as in a statically linked environment).
+
+# Notes specific to this platform (alcf-aurora-lustre-ld-preload)
+########################
+# Use default compilers on Aurora and ultimately use LD_PRELOAD
+# (in the pbs-submit script) to instrument binaries.
+#
+# RUNJOB is responsible for submitting a PBS job, waiting for its
+# completion, and checking its return status
+
+export DARSHAN_CC=mpicc
+export DARSHAN_CXX=mpicxx
+export DARSHAN_F77=mpifort
+export DARSHAN_F90=mpifort
+
+export DARSHAN_RUNJOB=$DARSHAN_TESTDIR/$DARSHAN_PLATFORM/runjob.sh
diff --git a/darshan-test/regression/alcf-aurora-lustre-ld-preload/pbs-submit.sh b/darshan-test/regression/alcf-aurora-lustre-ld-preload/pbs-submit.sh
diff --git a/darshan-test/regression/alcf-aurora-lustre-ld-preload/runjob.sh b/darshan-test/regression/alcf-aurora-lustre-ld-preload/runjob.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Submit the given command as a PBS job, poll qstat until the job finishes, and exit 0 only if the job exited 0.
+PROJ=radix-io
+
+# can't pass args to scripts with PBS, so we assign to an env
+# var and reference that in the submit script
+export DARSHAN_SCRIPT_ARGS="$*"
+
+# set list of env vars to pass through to PBS job
+ENV_VAR_LIST="DARSHAN_LOGFILE,DARSHAN_DEFAULT_NPROCS,DARSHAN_SCRIPT_ARGS,DARSHAN_RUNTIME_PATH"
+if [ -n "${DXT_ENABLE_IO_TRACE+defined}" ]; then
+	ENV_VAR_LIST="$ENV_VAR_LIST,DXT_ENABLE_IO_TRACE"
+fi
+
+# submit job and wait for it to return
+jobid=$(qsub -A "$PROJ" -q debug -l select=1,walltime=0:10:00,filesystems=home:flare -v "$ENV_VAR_LIST" -o "$DARSHAN_TMP/$$-tmp.out" -e "$DARSHAN_TMP/$$-tmp.err" "$DARSHAN_TESTDIR/$DARSHAN_PLATFORM/pbs-submit.sh")
+
+if [ $? -ne 0 ]; then
+        echo "Error: failed to qsub $*"
+        exit 1
+fi
+
+# qstat seems to return errors a lot here... so use a retry loop
+retries=0
+max_retries=5
+while true; do
+    sleep 5
+    qstat_output=$(qstat -f -x "$jobid")
+    if [[ $? -ne 0 ]]; then
+        echo "qstat failed (attempt $((retries + 1)) of $max_retries)"
+        ((retries++))
+        if [[ $retries -ge $max_retries ]]; then
+            echo "qstat failed $max_retries times. Giving up."
+            exit 1
+        fi
+        continue
+    fi
+
+    # reset retry counter on successful qstat
+    retries=0
+
+    # determine if job finished, and break out of loop if so
+    job_state=$(echo "$qstat_output" | grep job_state | tr -d '[:blank:]' | cut -d= -f2)
+    if [[ "$job_state" == "F" ]]; then
+        break
+    fi
+done
+# succeed only on an explicit Exit_status of 0; a missing or unparsable value must not be reported as a pass
+job_exit=$(echo "$qstat_output" | grep Exit_status | tr -d '[:blank:]' | cut -d= -f2)
+if [ "$job_exit" = "0" ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/darshan-test/regression/test-cases/fperf-f77.sh b/darshan-test/regression/test-cases/fperf-f77.sh
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
 fi
 
 # check results
-# in this case we want to confirm that both the MPI and POSIX open counters were triggered
-POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$POSIX_OPENS" -gt 0 ]; then
-    echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
+# in this case we want to confirm that both the MPI and low-level (POSIX or DFS) open counters were triggered
+FILE_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ -z "$FILE_OPENS" ]; then
+    FILE_OPENS=`grep DFS_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+fi
+if [ ! "${FILE_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0, so the check fails instead of erroring in [ ]
+    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
     exit 1
 fi
-MPI_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$MPI_OPENS" -gt 0 ]; then
-    echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
+MPIIO_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ ! "${MPIIO_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0
+    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
     exit 1
 fi
 
diff --git a/darshan-test/regression/test-cases/fperf-f90.sh b/darshan-test/regression/test-cases/fperf-f90.sh
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
 fi
 
 # check results
-# in this case we want to confirm that both the MPI and POSIX open counters were triggered
-POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$POSIX_OPENS" -gt 0 ]; then
-    echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
+# in this case we want to confirm that both the MPI and low-level (POSIX or DFS) open counters were triggered
+FILE_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ -z "$FILE_OPENS" ]; then
+    FILE_OPENS=`grep DFS_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+fi
+if [ ! "${FILE_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0, so the check fails instead of erroring in [ ]
+    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
     exit 1
 fi
-MPI_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$MPI_OPENS" -gt 0 ]; then
-    echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
+MPIIO_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ ! "${MPIIO_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0
+    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
     exit 1
 fi
 
diff --git a/darshan-test/regression/test-cases/mpi-io-test.sh b/darshan-test/regression/test-cases/mpi-io-test.sh
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
 fi
 
 # check results
-# in this case we want to confirm that both the MPI and POSIX open counters were triggered
-POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$POSIX_OPENS" -gt 0 ]; then
-    echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
+# in this case we want to confirm that both the MPI and low-level (POSIX or DFS) open counters were triggered
+FILE_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ -z "$FILE_OPENS" ]; then
+    FILE_OPENS=`grep DFS_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+fi
+if [ ! "${FILE_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0, so the check fails instead of erroring in [ ]
+    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
     exit 1
 fi
-MPI_OPENS=`grep INDEP_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
-if [ ! "$MPI_OPENS" -gt 0 ]; then
-    echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
+MPIIO_OPENS=`grep INDEP_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
+if [ ! "${MPIIO_OPENS:-0}" -gt 0 ]; then  # counter absent from the log counts as 0
+    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
     exit 1
 fi