Skip to content

Commit 49ba1b4

Browse files
author
Shane Snyder
committed
updates for Aurora-DAOS regression testing
1 parent f2890c5 commit 49ba1b4

File tree

10 files changed

+188
-60
lines changed

10 files changed

+188
-60
lines changed
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
#!/bin/bash
#
# PBS batch script: runs one regression-test command under mpiexec with the
# Darshan runtime library preloaded, bracketed by the launch-dfuse.sh and
# clean-dfuse.sh helpers.
#
# Environment read by this script:
#   DAOS_POOL, DAOS_CONT    - joined as "pool:cont" and handed to the dfuse helpers
#   DARSHAN_DEFAULT_NPROCS  - total number of MPI processes to launch
#   PBS_NODEFILE            - file whose line count is used as the node count
#   DARSHAN_RUNTIME_PATH    - prefix containing lib/libdarshan.so
#   DARSHAN_SCRIPT_ARGS     - the command (and its arguments) to execute
#
# Exit status: the exit status of the mpiexec command.

module use /soft/modulefiles
module load daos/base

# Run the dfuse launch helper for this pool/container, then print any dfuse
# entries from the mount table.
launch-dfuse.sh "${DAOS_POOL}:${DAOS_CONT}"
mount | grep dfuse

nprocs=$DARSHAN_DEFAULT_NPROCS
nnodes=$(wc -l < "$PBS_NODEFILE")

# Processes per node. Guard the division against an empty or unreadable node
# file, and never hand mpiexec a ppn below 1 (integer division yields 0 when
# there are fewer processes than nodes).
if [ "$nnodes" -gt 0 ] 2>/dev/null; then
    ppn=$((nprocs / nnodes))
else
    ppn=1
fi
if [ "$ppn" -lt 1 ]; then
    ppn=1
fi

# DARSHAN_SCRIPT_ARGS is expanded unquoted on purpose: it carries a whole
# command line packed into a single variable and must be split into words.
# shellcheck disable=SC2086
mpiexec -n "$nprocs" --ppn "$ppn" --env "LD_PRELOAD=$DARSHAN_RUNTIME_PATH/lib/libdarshan.so" $DARSHAN_SCRIPT_ARGS
EXIT_STATUS=$?

# Run the dfuse cleanup helper regardless of how the test command exited.
clean-dfuse.sh "${DAOS_POOL}:${DAOS_CONT}"

exit $EXIT_STATUS
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/bin/bash
#
# Submit a command as a PBS job, wait for the job to finish, and report
# whether it succeeded.
#
# Usage: runjob.sh <command> [args...]
#
# Environment read by this script:
#   DARSHAN_TMP       - directory receiving the job's stdout/stderr files
#   DARSHAN_TESTDIR,
#   DARSHAN_PLATFORM  - together locate the pbs-submit.sh script to submit
#   DXT_ENABLE_IO_TRACE (optional) - forwarded to the job when defined
#
# Exit status: 0 when the job's Exit_status is 0; 1 when qsub fails, when
# qstat fails max_retries times in a row, or when the job's Exit_status is
# non-zero or cannot be determined.

PROJ=radix-io

# can't pass args to scripts with PBS, so we assign to an env
# var and reference that in the submit script
export DARSHAN_SCRIPT_ARGS="$*"

# set list of env vars to pass through to PBS job
ENV_VAR_LIST="DARSHAN_LOGFILE,DARSHAN_DEFAULT_NPROCS,DARSHAN_SCRIPT_ARGS,DARSHAN_RUNTIME_PATH,DAOS_POOL,DAOS_CONT"
if [ -n "${DXT_ENABLE_IO_TRACE+defined}" ]; then
    ENV_VAR_LIST="$ENV_VAR_LIST,DXT_ENABLE_IO_TRACE"
fi

# submit job; qsub's stdout is kept as the job id used for polling below
if ! jobid=$(qsub -A "$PROJ" -q debug \
        -l select=1,walltime=0:10:00,filesystems=home:daos_user,daos=daos_user \
        -v "$ENV_VAR_LIST" \
        -o "$DARSHAN_TMP/$$-tmp.out" -e "$DARSHAN_TMP/$$-tmp.err" \
        "$DARSHAN_TESTDIR/$DARSHAN_PLATFORM/pbs-submit.sh"); then
    echo "Error: failed to qsub $*"
    exit 1
fi

# qstat seems to return errors a lot here... so use a retry loop
retries=0
max_retries=5
while true; do
    sleep 5
    if ! qstat_output=$(qstat -f -x "$jobid"); then
        echo "qstat failed (attempt $((retries + 1)) of $max_retries)"
        ((retries++))
        if [[ $retries -ge $max_retries ]]; then
            echo "qstat failed $max_retries times. Giving up."
            exit 1
        fi
        continue
    fi

    # reset retry counter on successful qstat
    retries=0

    # determine if job finished, and break out of loop if so
    job_state=$(echo "$qstat_output" | grep job_state | tr -d '[:blank:]' | cut -d= -f2)
    if [[ "$job_state" == "F" ]]; then
        break
    fi
done

job_exit=$(echo "$qstat_output" | grep Exit_status | tr -d '[:blank:]' | cut -d= -f2)

# The negation sits outside the test so that a missing or non-numeric
# Exit_status (where `[` itself errors out with status 2) is reported as a
# failure instead of falling through to the success branch.
if ! [ "$job_exit" -eq 0 ] 2>/dev/null; then
    exit 1
else
    exit 0
fi

darshan-test/regression/alcf-aurora-ld-preload/runjob.sh

Lines changed: 0 additions & 39 deletions
This file was deleted.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#!/bin/bash

# Platform environment setup: alcf-aurora-ld-preload
# ==================================================
#
# Contract for every platform env script
# --------------------------------------
# The script has to export these variables for the regression tests:
#
#   DARSHAN_CC      command to compile C programs
#   DARSHAN_CXX     command to compile C++ programs
#   DARSHAN_F90     command to compile Fortran90 programs
#   DARSHAN_F77     command to compile Fortran77 programs
#   DARSHAN_RUNJOB  command to execute a job and wait for its completion
#
# Beyond that, a platform script is free to load optional modules (as in a
# Cray PE), set LD_PRELOAD variables (as in a dynamically linked
# environment), or generate mpicc wrappers (as in a statically linked
# environment).
#
# What this platform does
# -----------------------
# The default Aurora compilers are used as-is; binaries get instrumented
# later through LD_PRELOAD, which is set in the pbs-submit script.
#
# RUNJOB submits a PBS job, waits for it to complete, and checks its return
# status.

# Job launcher: the runjob.sh that lives in this platform's directory.
export DARSHAN_RUNJOB="${DARSHAN_TESTDIR}/${DARSHAN_PLATFORM}/runjob.sh"

# Compiler commands: C and C++ first, then both Fortran dialects, which share
# the same command.
export DARSHAN_CC=mpicc DARSHAN_CXX=mpicxx
export DARSHAN_F90=mpifort DARSHAN_F77=mpifort
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/bin/bash
#
# Submit a command as a PBS job, wait for the job to finish, and report
# whether it succeeded.
#
# Usage: runjob.sh <command> [args...]
#
# Environment read by this script:
#   DARSHAN_TMP       - directory receiving the job's stdout/stderr files
#   DARSHAN_TESTDIR,
#   DARSHAN_PLATFORM  - together locate the pbs-submit.sh script to submit
#   DXT_ENABLE_IO_TRACE (optional) - forwarded to the job when defined
#
# Exit status: 0 when the job's Exit_status is 0; 1 when qsub fails, when
# qstat fails max_retries times in a row, or when the job's Exit_status is
# non-zero or cannot be determined.

PROJ=radix-io

# can't pass args to scripts with PBS, so we assign to an env
# var and reference that in the submit script
export DARSHAN_SCRIPT_ARGS="$*"

# set list of env vars to pass through to PBS job
ENV_VAR_LIST="DARSHAN_LOGFILE,DARSHAN_DEFAULT_NPROCS,DARSHAN_SCRIPT_ARGS,DARSHAN_RUNTIME_PATH"
if [ -n "${DXT_ENABLE_IO_TRACE+defined}" ]; then
    ENV_VAR_LIST="$ENV_VAR_LIST,DXT_ENABLE_IO_TRACE"
fi

# submit job; qsub's stdout is kept as the job id used for polling below
if ! jobid=$(qsub -A "$PROJ" -q debug \
        -l select=1,walltime=0:10:00,filesystems=home:flare \
        -v "$ENV_VAR_LIST" \
        -o "$DARSHAN_TMP/$$-tmp.out" -e "$DARSHAN_TMP/$$-tmp.err" \
        "$DARSHAN_TESTDIR/$DARSHAN_PLATFORM/pbs-submit.sh"); then
    echo "Error: failed to qsub $*"
    exit 1
fi

# qstat seems to return errors a lot here... so use a retry loop
retries=0
max_retries=5
while true; do
    sleep 5
    if ! qstat_output=$(qstat -f -x "$jobid"); then
        echo "qstat failed (attempt $((retries + 1)) of $max_retries)"
        ((retries++))
        if [[ $retries -ge $max_retries ]]; then
            echo "qstat failed $max_retries times. Giving up."
            exit 1
        fi
        continue
    fi

    # reset retry counter on successful qstat
    retries=0

    # determine if job finished, and break out of loop if so
    job_state=$(echo "$qstat_output" | grep job_state | tr -d '[:blank:]' | cut -d= -f2)
    if [[ "$job_state" == "F" ]]; then
        break
    fi
done

job_exit=$(echo "$qstat_output" | grep Exit_status | tr -d '[:blank:]' | cut -d= -f2)

# The negation sits outside the test so that a missing or non-numeric
# Exit_status (where `[` itself errors out with status 2) is reported as a
# failure instead of falling through to the success branch.
if ! [ "$job_exit" -eq 0 ] 2>/dev/null; then
    exit 1
else
    exit 0
fi

darshan-test/regression/test-cases/fperf-f77.sh

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
2828
fi
2929

3030
# check results
31-
# in this case we want to confirm that both the MPI and POSIX open counters were triggered
32-
POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
33-
if [ ! "$POSIX_OPENS" -gt 0 ]; then
34-
echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
31+
# in this case we want to confirm that both the MPI-IO and low-level (POSIX or DFS) open counters were triggered
# Field 5 of the matching non-comment line in the parsed log is taken as the
# counter value.
FILE_OPENS=$(grep POSIX_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
if [ -z "$FILE_OPENS" ]; then
    # nothing found for POSIX_OPENS; fall back to the DFS open counter
    FILE_OPENS=$(grep DFS_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
fi
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$FILE_OPENS" -gt 0 ] 2>/dev/null; then
    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
    exit 1
fi
37-
MPI_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
38-
if [ ! "$MPI_OPENS" -gt 0 ]; then
39-
echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
40+
# Confirm the MPI-IO collective open counter was triggered. Field 5 of the
# matching non-comment line in the parsed log is taken as the counter value.
MPIIO_OPENS=$(grep COLL_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$MPIIO_OPENS" -gt 0 ] 2>/dev/null; then
    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
    exit 1
fi
4245

darshan-test/regression/test-cases/fperf-f90.sh

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
2828
fi
2929

3030
# check results
31-
# in this case we want to confirm that both the MPI and POSIX open counters were triggered
32-
POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
33-
if [ ! "$POSIX_OPENS" -gt 0 ]; then
34-
echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
31+
# in this case we want to confirm that both the MPI-IO and low-level (POSIX or DFS) open counters were triggered
# Field 5 of the matching non-comment line in the parsed log is taken as the
# counter value.
FILE_OPENS=$(grep POSIX_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
if [ -z "$FILE_OPENS" ]; then
    # nothing found for POSIX_OPENS; fall back to the DFS open counter
    FILE_OPENS=$(grep DFS_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
fi
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$FILE_OPENS" -gt 0 ] 2>/dev/null; then
    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
    exit 1
fi
37-
MPI_OPENS=`grep COLL_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
38-
if [ ! "$MPI_OPENS" -gt 0 ]; then
39-
echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
40+
# Confirm the MPI-IO collective open counter was triggered. Field 5 of the
# matching non-comment line in the parsed log is taken as the counter value.
MPIIO_OPENS=$(grep COLL_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$MPIIO_OPENS" -gt 0 ] 2>/dev/null; then
    # message wording matches the variable name and the fperf-f77 test case
    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
    exit 1
fi
4245

darshan-test/regression/test-cases/mpi-io-test.sh

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,18 @@ if [ $? -ne 0 ]; then
2828
fi
2929

3030
# check results
31-
# in this case we want to confirm that both the MPI and POSIX open counters were triggered
32-
POSIX_OPENS=`grep POSIX_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
33-
if [ ! "$POSIX_OPENS" -gt 0 ]; then
34-
echo "Error: POSIX open count of $POSIX_OPENS is incorrect" 1>&2
31+
# in this case we want to confirm that both the MPI-IO and low-level (POSIX or DFS) open counters were triggered
# Field 5 of the matching non-comment line in the parsed log is taken as the
# counter value.
FILE_OPENS=$(grep POSIX_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
if [ -z "$FILE_OPENS" ]; then
    # nothing found for POSIX_OPENS; fall back to the DFS open counter
    FILE_OPENS=$(grep DFS_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
fi
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$FILE_OPENS" -gt 0 ] 2>/dev/null; then
    echo "Error: file open count of $FILE_OPENS is incorrect" 1>&2
    exit 1
fi
37-
MPI_OPENS=`grep INDEP_OPENS $DARSHAN_TMP/${PROG}.darshan.txt |grep -vE "^#" |cut -f 5`
38-
if [ ! "$MPI_OPENS" -gt 0 ]; then
39-
echo "Error: MPI open count of $MPI_OPENS is incorrect" 1>&2
40+
# Confirm the MPI-IO independent open counter was triggered. Field 5 of the
# matching non-comment line in the parsed log is taken as the counter value.
MPIIO_OPENS=$(grep INDEP_OPENS "$DARSHAN_TMP/${PROG}.darshan.txt" | grep -vE "^#" | cut -f 5)
# The negation sits outside the test: when the value is empty or not a single
# integer, `[` itself fails (status 2), and that must count as an error
# rather than silently skipping this branch.
if ! [ "$MPIIO_OPENS" -gt 0 ] 2>/dev/null; then
    # message wording matches the variable name and the fperf-f77 test case
    echo "Error: MPI-IO open count of $MPIIO_OPENS is incorrect" 1>&2
    exit 1
fi
4245

0 commit comments

Comments
 (0)