Skip to content

Commit 1d3f761

Browse files
committed
Merge branch 'azamat/aurora/tiles-cores-modules' into next (PR #7399)
Load recommended mpich-config modules and env-vars on Aurora. Also: set env-vars and export to mpiexec an unlimited core file size limit in debug runs; update `jobid_pattern` to refine job-id extraction from non-standard PBS output; clean up tabs. [BFB]
2 parents 31ed3b3 + 429b10f commit 1d3f761

File tree

2 files changed

+44
-35
lines changed

2 files changed

+44
-35
lines changed

cime_config/machines/config_batch.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,7 @@
553553

554554
<batch_system MACH="aurora" type="pbspro">
555555
<batch_submit>/lus/flare/projects/E3SM_Dec/tools/qsub/throttle</batch_submit>
556+
<jobid_pattern>(\d+)\.aurora-pbs</jobid_pattern>
556557
<directives>
557558
<directive> -l filesystems=home:flare </directive>
558559
</directives>

cime_config/machines/config_machines.xml

Lines changed: 43 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3608,66 +3608,68 @@
36083608
<MAX_MPITASKS_PER_NODE compiler="oneapi-ifxgpu">12</MAX_MPITASKS_PER_NODE>
36093609
<PROJECT_REQUIRED>FALSE</PROJECT_REQUIRED>
36103610
<mpirun mpilib="default">
3611-
<executable>mpiexec</executable>
3612-
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
3613-
<arguments>
3614-
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
3615-
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
3616-
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND}</arg>
3617-
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS}</arg>
3618-
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg>
3619-
</arguments>
3611+
<executable>mpiexec</executable>
3612+
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
3613+
<arguments>
3614+
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
3615+
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
3616+
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND}</arg>
3617+
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS} $ENV{RLIMITS}</arg>
3618+
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg>
3619+
</arguments>
36203620
</mpirun>
36213621
<module_system type="module" allow_error="true">
3622-
<init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
3623-
<init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
3624-
<init_path lang="python">/usr/share/lmod/lmod/init/env_modules_python.py</init_path>
3625-
<cmd_path lang="sh">module</cmd_path>
3626-
<cmd_path lang="csh">module</cmd_path>
3627-
<cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
3628-
<modules>
3629-
<command name="load">cmake/3.30.5</command>
3630-
<command name="load">oneapi/release/2025.0.5</command>
3631-
</modules>
3632-
</module_system>
3633-
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
3634-
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
3635-
<MAX_GB_OLD_TEST_DATA>0</MAX_GB_OLD_TEST_DATA>
3636-
<environment_variables>
3622+
<init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
3623+
<init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
3624+
<init_path lang="python">/usr/share/lmod/lmod/init/env_modules_python.py</init_path>
3625+
<cmd_path lang="sh">module</cmd_path>
3626+
<cmd_path lang="csh">module</cmd_path>
3627+
<cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
3628+
<modules>
3629+
<command name="load">cmake/3.30.5</command>
3630+
<command name="load">oneapi/release/2025.0.5</command>
3631+
<command name="load">mpich-config/collective-tuning/1024</command>
3632+
</modules>
3633+
</module_system>
3634+
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
3635+
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
3636+
<MAX_GB_OLD_TEST_DATA>0</MAX_GB_OLD_TEST_DATA>
3637+
<environment_variables>
36373638
<env name="NETCDF_PATH">/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002</env>
36383639
<env name="PNETCDF_PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002</env>
36393640
<env name="LD_LIBRARY_PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002/lib:/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002/lib:$ENV{LD_LIBRARY_PATH}</env>
36403641
<env name="PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002/bin:/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002/bin:$ENV{PATH}</env>
36413642
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env>
36423643
<env name="FI_CXI_CQ_FILL_PERCENT">20</env>
3644+
<env name="RLIMITS"> </env>
36433645
</environment_variables>
36443646
<environment_variables compiler="oneapi-ifxgpu">
36453647
<env name="ONEAPI_DEVICE_SELECTOR">level_zero:gpu</env>
3646-
<env name="MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE"></env>
3647-
<env name="MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE"></env>
3648-
<env name="MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE"></env>
36493648
<env name="UR_L0_USE_DRIVER_INORDER_LISTS">1</env>
36503649
<env name="UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS">1</env>
36513650
<env name="UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE">1</env>
3652-
<!--<env name="FI_PROVIDER">cxi</env>-->
3653-
<env name="FI_MR_CACHE_MONITOR">disabled</env>
3651+
<env name="FI_MR_CACHE_MONITOR">disabled</env>
36543652
<env name="FI_CXI_OVFLOW_BUF_SIZE">8388608</env>
36553653
<env name="PALS_PING_PERIOD">240</env>
36563654
<env name="PALS_RPC_TIMEOUT">240</env>
3655+
<env name="SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE">1</env>
3656+
<env name="SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR">1</env>
3657+
<env name="SYCL_PI_LEVEL_ZERO_USM_RESIDENT">0x001</env>
3658+
<env name="UR_L0_USE_DRIVER_INORDER_LISTS">1</env>
3659+
<env name="UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE">1</env>
36573660

3658-
<env name="MPIR_CVAR_ENABLE_GPU">1</env>
3661+
<env name="MPIR_CVAR_ENABLE_GPU">1</env>
36593662
<env name="romio_cb_read">disable</env>
36603663
<env name="romio_cb_write">disable</env>
3661-
<env name="SYCL_CACHE_PERSISTENT">1</env>
36623664
<env name="GATOR_INITIAL_MB">4000MB</env>
36633665
<env name="GATOR_DISABLE">0</env>
36643666
<env name="GPU_TILE_COMPACT">/lus/flare/projects/E3SM_Dec/tools/mpi_wrapper_utils/gpu_tile_compact.sh</env>
36653667
<env name="RANKS_BIND">list:1-8:9-16:17-24:25-32:33-40:41-48:53-60:61-68:69-76:77-84:85-92:93-100 --gpu-bind list:0.0:0.1:1.0:1.1:2.0:2.1:3.0:3.1:4.0:4.1:5.0:5.1 --mem-bind list:0:0:0:0:0:0:1:1:1:1:1:1</env>
36663668
<env name="ZES_ENABLE_SYSMAN">1</env>
3667-
<!-- default is ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE: enable this to run 4 MPI/tile or 48 MPI/node
3668-
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4</env>-->
3669-
<!-- <env name="ZE_FLAT_DEVICE_HIERARCHY">FLAT</env>
3670-
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4,6:4,7:4,8:4,9:4,10:4,11:4</env>-->
3669+
<!-- default is ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE: enable this to run 4 MPI/tile or 48 MPI/node
3670+
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4,4:4,5:4</env>-->
3671+
<!-- <env name="ZE_FLAT_DEVICE_HIERARCHY">FLAT</env>
3672+
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4,4:4,5:4,6:4,7:4,8:4,9:4,10:4,11:4</env>-->
36713673
</environment_variables>
36723674
<environment_variables compiler="oneapi-ifx">
36733675
<env name="LIBOMPTARGET_DEBUG">0</env><!--default 0, max 5 -->
@@ -3680,6 +3682,12 @@
36803682
<env name="KMP_AFFINITY">granularity=core,balanced</env>
36813683
<env name="OMP_STACKSIZE">128M</env>
36823684
</environment_variables>
3685+
<environment_variables DEBUG="TRUE">
3686+
<env name="RLIMITS">--rlimits CORE</env>
3687+
</environment_variables>
3688+
<resource_limits DEBUG="TRUE">
3689+
<resource name="RLIMIT_CORE">-1</resource>
3690+
</resource_limits>
36833691
<resource_limits>
36843692
<resource name="RLIMIT_STACK">-1</resource>
36853693
</resource_limits>

0 commit comments

Comments
 (0)