Skip to content

Commit 32bb582

Browse files
committed
Merge branch 'azamat/aurora/tiles-cores-modules' (PR #7399)
Load recommended mpich-config modules and env-vars on Aurora. Also: (1) set env-vars and export an unlimited core file size limit to mpiexec in debug runs; (2) update `jobid_pattern` to refine job-id extraction from non-standard PBS output; (3) clean up tabs. [BFB]
2 parents 1061d4b + e072fff commit 32bb582

File tree

2 files changed

+47
-36
lines changed

2 files changed

+47
-36
lines changed

cime_config/machines/config_batch.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,7 @@
553553

554554
<batch_system MACH="aurora" type="pbspro">
555555
<batch_submit>/lus/flare/projects/E3SM_Dec/tools/qsub/throttle</batch_submit>
556+
<jobid_pattern>(\d+)\.aurora-pbs</jobid_pattern>
556557
<directives>
557558
<directive> -l filesystems=home:flare </directive>
558559
</directives>

cime_config/machines/config_machines.xml

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3587,7 +3587,7 @@
35873587
<NODENAME_REGEX>aurora-uan-.*</NODENAME_REGEX>
35883588
<OS>LINUX</OS>
35893589
<COMPILERS>oneapi-ifxgpu,oneapi-ifx</COMPILERS>
3590-
<MPILIBS>mpich</MPILIBS>
3590+
<MPILIBS>mpich,mpich1024</MPILIBS>
35913591
<PROJECT>E3SM_Dec</PROJECT>
35923592
<SAVE_TIMING_DIR>/lus/flare/projects/E3SM_Dec/performance_archive</SAVE_TIMING_DIR>
35933593
<SAVE_TIMING_DIR_PROJECTS>.*</SAVE_TIMING_DIR_PROJECTS>
@@ -3608,66 +3608,70 @@
36083608
<MAX_MPITASKS_PER_NODE compiler="oneapi-ifxgpu">12</MAX_MPITASKS_PER_NODE>
36093609
<PROJECT_REQUIRED>FALSE</PROJECT_REQUIRED>
36103610
<mpirun mpilib="default">
3611-
<executable>mpiexec</executable>
3612-
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
3613-
<arguments>
3614-
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
3615-
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
3616-
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND}</arg>
3617-
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS}</arg>
3618-
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg>
3619-
</arguments>
3611+
<executable>mpiexec</executable>
3612+
<!--executable>numactl -m 2-3 mpiexec</executable--><!--for HBM runs-->
3613+
<arguments>
3614+
<arg name="total_num_tasks">-np {{ total_tasks }} --label</arg>
3615+
<arg name="ranks_per_node">-ppn {{ tasks_per_node }}</arg>
3616+
<arg name="ranks_bind">--cpu-bind $ENV{RANKS_BIND}</arg>
3617+
<arg name="threads_per_rank">-d $ENV{OMP_NUM_THREADS} $ENV{RLIMITS}</arg>
3618+
<arg name="gpu_maps">$ENV{GPU_TILE_COMPACT}</arg>
3619+
</arguments>
36203620
</mpirun>
36213621
<module_system type="module" allow_error="true">
3622-
<init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
3623-
<init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
3624-
<init_path lang="python">/usr/share/lmod/lmod/init/env_modules_python.py</init_path>
3625-
<cmd_path lang="sh">module</cmd_path>
3626-
<cmd_path lang="csh">module</cmd_path>
3627-
<cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
3628-
<modules>
3629-
<command name="load">cmake/3.30.5</command>
3630-
<command name="load">oneapi/release/2025.0.5</command>
3631-
</modules>
3632-
</module_system>
3633-
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
3634-
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
3635-
<MAX_GB_OLD_TEST_DATA>0</MAX_GB_OLD_TEST_DATA>
3636-
<environment_variables>
3622+
<init_path lang="sh">/usr/share/lmod/lmod/init/sh</init_path>
3623+
<init_path lang="csh">/usr/share/lmod/lmod/init/csh</init_path>
3624+
<init_path lang="python">/usr/share/lmod/lmod/init/env_modules_python.py</init_path>
3625+
<cmd_path lang="sh">module</cmd_path>
3626+
<cmd_path lang="csh">module</cmd_path>
3627+
<cmd_path lang="python">/usr/share/lmod/lmod/libexec/lmod python</cmd_path>
3628+
<modules>
3629+
<command name="load">cmake/3.30.5</command>
3630+
<command name="load">oneapi/release/2025.0.5</command>
3631+
</modules>
3632+
<modules mpilib="mpich1024">
3633+
<command name="load">mpich-config/collective-tuning/1024</command>
3634+
</modules>
3635+
</module_system>
3636+
<RUNDIR>$CIME_OUTPUT_ROOT/$CASE/run</RUNDIR>
3637+
<EXEROOT>$CIME_OUTPUT_ROOT/$CASE/bld</EXEROOT>
3638+
<MAX_GB_OLD_TEST_DATA>0</MAX_GB_OLD_TEST_DATA>
3639+
<environment_variables>
36373640
<env name="NETCDF_PATH">/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002</env>
36383641
<env name="PNETCDF_PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002</env>
36393642
<env name="LD_LIBRARY_PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002/lib:/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002/lib:$ENV{LD_LIBRARY_PATH}</env>
36403643
<env name="PATH">/lus/flare/projects/E3SM_Dec/soft/pnetcdf/1.14.0/oneapi.eng.2024.07.30.002/bin:/lus/flare/projects/E3SM_Dec/soft/netcdf/4.9.2c-4.6.1f/oneapi.eng.2024.07.30.002/bin:$ENV{PATH}</env>
36413644
<env name="FI_CXI_DEFAULT_CQ_SIZE">131072</env>
36423645
<env name="FI_CXI_CQ_FILL_PERCENT">20</env>
3646+
<env name="RLIMITS"> </env>
36433647
</environment_variables>
36443648
<environment_variables compiler="oneapi-ifxgpu">
36453649
<env name="ONEAPI_DEVICE_SELECTOR">level_zero:gpu</env>
3646-
<env name="MPIR_CVAR_CH4_COLL_SELECTION_TUNING_JSON_FILE"></env>
3647-
<env name="MPIR_CVAR_COLL_SELECTION_TUNING_JSON_FILE"></env>
3648-
<env name="MPIR_CVAR_CH4_POSIX_COLL_SELECTION_TUNING_JSON_FILE"></env>
36493650
<env name="UR_L0_USE_DRIVER_INORDER_LISTS">1</env>
36503651
<env name="UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS">1</env>
36513652
<env name="UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE">1</env>
3652-
<!--<env name="FI_PROVIDER">cxi</env>-->
3653-
<env name="FI_MR_CACHE_MONITOR">disabled</env>
3653+
<env name="FI_MR_CACHE_MONITOR">disabled</env>
36543654
<env name="FI_CXI_OVFLOW_BUF_SIZE">8388608</env>
36553655
<env name="PALS_PING_PERIOD">240</env>
36563656
<env name="PALS_RPC_TIMEOUT">240</env>
3657+
<env name="SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE">1</env>
3658+
<env name="SYCL_PI_LEVEL_ZERO_DISABLE_USM_ALLOCATOR">1</env>
3659+
<env name="SYCL_PI_LEVEL_ZERO_USM_RESIDENT">0x001</env>
3660+
<env name="UR_L0_USE_DRIVER_INORDER_LISTS">1</env>
3661+
<env name="UR_L0_USE_COPY_ENGINE_FOR_IN_ORDER_QUEUE">1</env>
36573662

3658-
<env name="MPIR_CVAR_ENABLE_GPU">1</env>
3663+
<env name="MPIR_CVAR_ENABLE_GPU">1</env>
36593664
<env name="romio_cb_read">disable</env>
36603665
<env name="romio_cb_write">disable</env>
3661-
<env name="SYCL_CACHE_PERSISTENT">1</env>
36623666
<env name="GATOR_INITIAL_MB">4000MB</env>
36633667
<env name="GATOR_DISABLE">0</env>
36643668
<env name="GPU_TILE_COMPACT">/lus/flare/projects/E3SM_Dec/tools/mpi_wrapper_utils/gpu_tile_compact.sh</env>
36653669
<env name="RANKS_BIND">list:1-8:9-16:17-24:25-32:33-40:41-48:53-60:61-68:69-76:77-84:85-92:93-100 --gpu-bind list:0.0:0.1:1.0:1.1:2.0:2.1:3.0:3.1:4.0:4.1:5.0:5.1 --mem-bind list:0:0:0:0:0:0:1:1:1:1:1:1</env>
36663670
<env name="ZES_ENABLE_SYSMAN">1</env>
3667-
<!-- default is ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE: enable this to run 4 MPI/tile or 48 MPI/node
3668-
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4</env>-->
3669-
<!-- <env name="ZE_FLAT_DEVICE_HIERARCHY">FLAT</env>
3670-
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4,6:4,7:4,8:4,9:4,10:4,11:4</env>-->
3671+
<!-- default is ZE_FLAT_DEVICE_HIERARCHY=COMPOSITE: enable this to run 4 MPI/tile or 48 MPI/node
3672+
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4</env>-->
3673+
<!-- <env name="ZE_FLAT_DEVICE_HIERARCHY">FLAT</env>
3674+
<env name="ZEX_NUMBER_OF_CCS">0:4,1:4,2:4,3:4:4:4,5:4,6:4,7:4,8:4,9:4,10:4,11:4</env>-->
36713675
</environment_variables>
36723676
<environment_variables compiler="oneapi-ifx">
36733677
<env name="LIBOMPTARGET_DEBUG">0</env><!--default 0, max 5 -->
@@ -3680,6 +3684,12 @@
36803684
<env name="KMP_AFFINITY">granularity=core,balanced</env>
36813685
<env name="OMP_STACKSIZE">128M</env>
36823686
</environment_variables>
3687+
<environment_variables DEBUG="TRUE">
3688+
<env name="RLIMITS">--rlimits CORE</env>
3689+
</environment_variables>
3690+
<resource_limits DEBUG="TRUE">
3691+
<resource name="RLIMIT_CORE">-1</resource>
3692+
</resource_limits>
36833693
<resource_limits>
36843694
<resource name="RLIMIT_STACK">-1</resource>
36853695
</resource_limits>

0 commit comments

Comments
 (0)