Tools/machines/greatlakes-umich/greatlakes_v100.sbatch (3 changes: 1 addition & 2 deletions)
@@ -26,8 +26,7 @@ INPUTS=inputs
# per node are 2x 2.4 GHz Intel Xeon Gold 6148
# note: the system seems to only expose cores (20 per socket),
# not hyperthreads (40 per socket)
-export SRUN_CPUS_PER_TASK=20
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# GPU-aware MPI optimizations
GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"
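Not part of the diff above, but a quick way to confirm the new wiring: run a one-task step inside the allocation and check that SLURM_CPUS_PER_TASK (populated from the job's --cpus-per-task request) and OMP_NUM_THREADS agree. A minimal sketch, assuming it is placed in the job script after the export; on Great Lakes the expected value is presumably 20, matching the removed SRUN_CPUS_PER_TASK setting.

# sanity check (illustrative, not part of this PR): both values should match --cpus-per-task
srun --ntasks=1 bash -c 'echo "SLURM_CPUS_PER_TASK=${SLURM_CPUS_PER_TASK} OMP_NUM_THREADS=${OMP_NUM_THREADS}"'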
Tools/machines/karolina-it4i/karolina_gpu.sbatch (7 changes: 3 additions & 4 deletions)
@@ -25,13 +25,12 @@
#SBATCH -o stdout_%j
#SBATCH -e stderr_%j

-# OpenMP threads per MPI rank
-export OMP_NUM_THREADS=16
-export SRUN_CPUS_PER_TASK=16

# set user rights to u=rwx;g=r-x;o=---
umask 0027

+# OpenMP threads per MPI rank
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# executable & inputs file or python interpreter & PICMI script here
EXE=./warpx.rz
INPUTS=./inputs_rz
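A possible hardening, not applied in this PR: give the export a default so the script still sets a sane thread count if SLURM_CPUS_PER_TASK is ever unset (for instance, when the script is adapted for a submission that does not request --cpus-per-task). The value 16 below simply mirrors the old hard-coded setting and is only illustrative.

# hypothetical fallback (not in this PR): default to 16 threads if Slurm does not set the variable
export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK:-16}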
Tools/machines/lonestar6-tacc/lonestar6_a100.sbatch (3 changes: 2 additions & 1 deletion)
@@ -14,6 +14,7 @@
#SBATCH -q regular
#SBATCH -C gpu
#SBATCH --exclusive
+#SBATCH --cpus-per-task=32
#SBATCH --gpu-bind=none
#SBATCH --gpus-per-node=4
#SBATCH -o WarpX.o%j
@@ -27,7 +28,7 @@ INPUTS=inputs_small
export MPICH_OFI_NIC_POLICY=GPU

# threads for OpenMP and threaded compressors per MPI rank
-export SRUN_CPUS_PER_TASK=32
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# depends on https://github.com/ECP-WarpX/WarpX/issues/2009
#GPU_AWARE_MPI="amrex.the_arena_is_managed=0 amrex.use_gpu_aware_mpi=1"
Tools/machines/perlmutter-nersc/perlmutter_cpu.sbatch (5 changes: 3 additions & 2 deletions)
@@ -13,6 +13,8 @@
#SBATCH -A <proj>
#SBATCH -q regular
#SBATCH -C cpu
+# 8 cores per chiplet, 2x SMP
+#SBATCH --cpus-per-task=16
#SBATCH --ntasks-per-node=16
#SBATCH --exclusive
#SBATCH -o WarpX.o%j
@@ -30,10 +32,9 @@ INPUTS=inputs_small
# This will be our MPI rank assignment (2x8 is 16 ranks/node).

# threads for OpenMP and threaded compressors per MPI rank
-export SRUN_CPUS_PER_TASK=16 # 8 cores per chiplet, 2x SMP
export OMP_PLACES=threads
export OMP_PROC_BIND=spread
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

srun --cpu-bind=cores \
${EXE} ${INPUTS} \
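As a bookkeeping note for the Perlmutter CPU script (not part of the diff), the new header and the script's own comments account for the whole node: 2 sockets x 8 chiplets give the 16 ranks per node, and 8 cores per chiplet with 2-way SMT give the 16 CPUs per task.

# per-node accounting implied by the settings above (illustrative only)
echo $(( 16 * 16 ))   # ranks/node x cpus-per-task = 256 hardware threads, the full node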
Tools/machines/perlmutter-nersc/perlmutter_gpu.sbatch (4 changes: 2 additions & 2 deletions)
@@ -17,6 +17,7 @@
# A100 80GB (256 nodes)
#S BATCH -C gpu&hbm80g
#SBATCH --exclusive
+#SBATCH --cpus-per-task=16
# ideally single:1, but NERSC cgroups issue
#SBATCH --gpu-bind=none
#SBATCH --ntasks-per-node=4
@@ -33,8 +34,7 @@ export MPICH_OFI_NIC_POLICY=GPU

# threads for OpenMP and threaded compressors per MPI rank
# note: 16 avoids hyperthreading (32 virtual cores, 16 physical)
-export SRUN_CPUS_PER_TASK=16
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# GPU-aware MPI optimizations
GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"
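One more optional check, again not part of this change set: print each task's CPU affinity mask to confirm that the four ranks per node land on disjoint sets of cores. A sketch, assuming it is run from the job script next to the srun line that launches WarpX.

# illustrative check (not in this PR): show the CPU mask each rank is bound to
srun --cpu-bind=cores bash -c 'echo "rank ${SLURM_PROCID}: $(grep Cpus_allowed_list /proc/self/status)"'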
Tools/machines/tioga-llnl/tioga_mi300a.sbatch (4 changes: 2 additions & 2 deletions)
@@ -12,6 +12,7 @@
#SBATCH -J WarpX
#S BATCH -A <proj> # project name not needed yet
#SBATCH -p mi300a
+#SBATCH --cpus-per-task=16
#SBATCH --gpu-bind=none
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-node=4
@@ -27,8 +28,7 @@ export MPICH_OFI_NIC_POLICY=GPU

# threads for OpenMP and threaded compressors per MPI rank
# note: 16 avoids hyperthreading (32 virtual cores, 16 physical)
-export SRUN_CPUS_PER_TASK=16
-export OMP_NUM_THREADS=${SRUN_CPUS_PER_TASK}
+export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK}

# GPU-aware MPI optimizations
GPU_AWARE_MPI="amrex.use_gpu_aware_mpi=1"