Commit 937c6eb

Merge pull request #207 from xylar/add-gpus-flag-to-job-scripts
Add gpus-per-node to job scripts and resources
2 parents: e327086 + 5fb5ac1 · commit 937c6eb

8 files changed: +46 additions, -13 deletions


docs/developers_guide/api.md (1 addition & 0 deletions)

@@ -231,6 +231,7 @@ seaice/api
     write_job_script
     get_slurm_options
+    clean_up_whitespace
 ```

 ### logging
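
With this entry, clean_up_whitespace (formerly the private _clean_up_whitespace) becomes part of the documented polaris.job API. A minimal usage sketch, assuming only what the docstring added later in this commit states (extra blank lines left by templating are removed):

    # clean_up_whitespace is now importable from polaris.job alongside the
    # existing helpers write_job_script and get_slurm_options.
    from polaris.job import clean_up_whitespace

    # Per the docstring added in this commit, extra blank lines left behind by
    # jinja templating are removed; the exact collapsing rule is not shown here.
    messy = '#!/bin/bash\n\n\n\n#SBATCH --nodes=1\n'
    print(clean_up_whitespace(messy))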

polaris/job/__init__.py (26 additions & 5 deletions)

@@ -42,7 +42,7 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
     cores = np.sqrt(target_cores * min_cores)
     nodes = int(np.ceil(cores / cores_per_node))

-    partition, qos, constraint, wall_time = get_slurm_options(
+    partition, qos, constraint, gpus_per_node, wall_time = get_slurm_options(
         config, machine, nodes)

     job_name = config.get('job', 'job_name')

@@ -58,8 +58,8 @@ def write_job_script(config, machine, target_cores, min_cores, work_dir,
     text = template.render(job_name=job_name, account=account,
                            nodes=f'{nodes}', wall_time=wall_time, qos=qos,
                            partition=partition, constraint=constraint,
-                           suite=suite)
-    text = _clean_up_whitespace(text)
+                           gpus_per_node=gpus_per_node, suite=suite)
+    text = clean_up_whitespace(text)
     if suite == '':
         script_filename = 'job_script.sh'
     else:

@@ -95,6 +95,9 @@ def get_slurm_options(config, machine, nodes):
     constraint : str
         Slurm constraint

+    gpus_per_node : str
+        The number of GPUs per node (if any)
+
     wall_time : str
         Slurm wall time
     """

@@ -131,12 +134,30 @@ def get_slurm_options(config, machine, nodes):
     else:
         constraint = ''

+    if config.has_option('parallel', 'gpus_per_node'):
+        gpus_per_node = config.get('parallel', 'gpus_per_node')
+    else:
+        gpus_per_node = ''
+
     wall_time = config.get('job', 'wall_time')

-    return partition, qos, constraint, wall_time
+    return partition, qos, constraint, gpus_per_node, wall_time


-def _clean_up_whitespace(text):
+def clean_up_whitespace(text):
+    """
+    Clean up whitespace after jinja templating
+
+    Parameters
+    ----------
+    text : str
+        Text to clean up
+
+    Returns
+    -------
+    text : str
+        Text with extra blank lines removed
+    """
     prev_line = None
     lines = text.split('\n')
     trimmed = list()
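
The hunks above widen the return value of get_slurm_options() to a 5-tuple and read the new option from the [parallel] config section. The sketch below mimics just the added gpus_per_node handling with a hand-built ConfigParser; the option value is made up for illustration, and real runs read it from a machine config file such as polaris/machines/pm-gpu.cfg:

    from configparser import ConfigParser

    # Runnable mimic of the gpus_per_node handling added to get_slurm_options();
    # the config contents here are illustrative, not a real machine config.
    config = ConfigParser()
    config.read_string("""
    [parallel]
    cores_per_node = 64
    gpus_per_node = 4
    """)

    if config.has_option('parallel', 'gpus_per_node'):
        gpus_per_node = config.get('parallel', 'gpus_per_node')
    else:
        gpus_per_node = ''

    # get_slurm_options() now returns this value as the fourth element of a
    # 5-tuple, so callers unpack
    #     partition, qos, constraint, gpus_per_node, wall_time = ...
    # and pass gpus_per_node through to the job script template.
    print(repr(gpus_per_node))  # '4'; would be '' if the option were absent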

polaris/job/job_script.template (3 additions & 0 deletions)

@@ -16,6 +16,9 @@
 {% if constraint != '' -%}
 #SBATCH --constraint={{ constraint }}
 {%- endif %}
+{% if gpus_per_node != '' -%}
+#SBATCH --gpus-per-node={{ gpus_per_node }}
+{%- endif %}

 source load_polaris_env.sh
 polaris serial {{suite}}
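
To illustrate how the new block behaves, here is a minimal sketch that renders just these template lines with jinja2 outside of Polaris; the values passed to render() are made up for illustration:

    from jinja2 import Template

    # Inlines only the relevant lines of the job script template shown above.
    snippet = Template(
        "{% if constraint != '' -%}\n"
        "#SBATCH --constraint={{ constraint }}\n"
        "{%- endif %}\n"
        "{% if gpus_per_node != '' -%}\n"
        "#SBATCH --gpus-per-node={{ gpus_per_node }}\n"
        "{%- endif %}\n"
    )

    # With a GPU count configured, an extra #SBATCH line is emitted.
    print(snippet.render(constraint='gpu', gpus_per_node='4'))

    # With gpus_per_node == '' the line is omitted; the leftover blank lines
    # are what clean_up_whitespace() trims from the rendered script.
    print(snippet.render(constraint='', gpus_per_node=''))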

polaris/machines/frontier.cfg (1 addition & 1 deletion)

@@ -42,7 +42,7 @@ use_e3sm_hdf5_netcdf = True
 # some defaults
 [parallel]

-# cores per node on the machine
+# allocatable cores per node on the machine
 cores_per_node = 56

 # threads per core (set to 1 because hyperthreading requires extra sbatch

polaris/machines/pm-gpu.cfg (2 additions & 2 deletions)

@@ -42,8 +42,8 @@ use_e3sm_hdf5_netcdf = True
 # some defaults
 [parallel]

-# cores per node on the machine
-cores_per_node = 128
+# cores per node on the machine (without hyperthreading)
+cores_per_node = 64

 # threads per core (set to 1 because trying to hyperthread seems to be causing
 # hanging on perlmutter)
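
Because write_job_script() computes the node count from cores_per_node (see the polaris/job/__init__.py hunk above), halving the value from 128 to 64 physical cores roughly doubles the number of nodes requested for the same core counts. A small worked example using the formula from that hunk, with made-up target and minimum core counts:

    import numpy as np

    # Worked example of the node-count formula from write_job_script();
    # the target/min core counts below are illustrative only.
    target_cores = 512
    min_cores = 128

    cores = np.sqrt(target_cores * min_cores)   # 256.0
    nodes_old = int(np.ceil(cores / 128))       # 2 with the old cores_per_node
    nodes_new = int(np.ceil(cores / 64))        # 4 with cores_per_node = 64
    print(nodes_old, nodes_new)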

polaris/parallel.py (5 additions & 0 deletions)

@@ -77,6 +77,11 @@ def get_available_parallel_resources(config):
         cores_per_node=cores_per_node,
         mpi_allowed=mpi_allowed
     )
+
+    if config.has_option('parallel', 'gpus_per_node'):
+        available_resources['gpus_per_node'] = \
+            config.getint('parallel', 'gpus_per_node')
+
     return available_resources
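
A minimal sketch of the effect on get_available_parallel_resources(): the gpus_per_node key only appears in the returned dictionary when the [parallel] option is set, and it is read with getint() here (the Slurm flag in get_slurm_options() uses the string form instead). The config below is hand-built for illustration:

    from configparser import ConfigParser

    # Mimics the lines added to get_available_parallel_resources(); the values
    # are illustrative, not taken from a real machine config.
    config = ConfigParser()
    config.read_string("[parallel]\ncores_per_node = 64\ngpus_per_node = 4\n")

    available_resources = {
        'cores_per_node': config.getint('parallel', 'cores_per_node'),
    }
    if config.has_option('parallel', 'gpus_per_node'):
        available_resources['gpus_per_node'] = \
            config.getint('parallel', 'gpus_per_node')

    print(available_resources)  # {'cores_per_node': 64, 'gpus_per_node': 4}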

utils/omega/ctest/job_script.template (3 additions & 0 deletions)

@@ -16,6 +16,9 @@
 {% if constraint != '' -%}
 #SBATCH --constraint={{ constraint }}
 {%- endif %}
+{% if gpus_per_node != '' -%}
+#SBATCH --gpus-per-node={{ gpus_per_node }}
+{%- endif %}

 cd {{ build_dir }}
 ./omega_ctest.sh

utils/omega/ctest/omega_ctest.py (5 additions & 5 deletions)

@@ -8,7 +8,7 @@

 from polaris.config import PolarisConfigParser
 from polaris.io import download, update_permissions
-from polaris.job import _clean_up_whitespace, get_slurm_options
+from polaris.job import clean_up_whitespace, get_slurm_options


 def make_build_script(machine, compiler, branch, build_only, mesh_filename,

@@ -61,7 +61,7 @@ def make_build_script(machine, compiler, branch, build_only, mesh_filename,
                              clean=clean,
                              cmake_flags=cmake_flags)

-    script = _clean_up_whitespace(script)
+    script = clean_up_whitespace(script)

     build_omega_dir = os.path.abspath('build_omega')
     os.makedirs(build_omega_dir, exist_ok=True)

@@ -120,7 +120,7 @@ def write_job_script(config, machine, compiler, submit):

     nodes = 1

-    partition, qos, constraint, _ = get_slurm_options(
+    partition, qos, constraint, gpus_per_node, _ = get_slurm_options(
         config, machine, nodes)

     wall_time = '0:15:00'

@@ -156,8 +156,8 @@ def write_job_script(config, machine, compiler, submit):
     script = template.render(job_name=job_name, account=account,
                              nodes=f'{nodes}', wall_time=wall_time, qos=qos,
                              partition=partition, constraint=constraint,
-                             build_dir=build_dir)
-    script = _clean_up_whitespace(script)
+                             gpus_per_node=gpus_per_node, build_dir=build_dir)
+    script = clean_up_whitespace(script)

     build_omega_dir = os.path.abspath('build_omega')
     script_filename = f'job_build_and_ctest_omega_{machine}_{compiler}.sh'
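
The Omega ctest utility follows the same pattern as write_job_script() in polaris/job: it now unpacks a 5-tuple, discards the returned wall_time (it hard-codes 0:15:00), and forwards gpus_per_node to its template. A small sketch of that unpack-and-discard idiom, with a placeholder tuple standing in for the real get_slurm_options() call:

    # Placeholder values standing in for get_slurm_options(config, machine,
    # nodes); they are made up for illustration.
    slurm_options = ('batch', 'regular', 'gpu', '4', '2:00:00')

    # The ctest script keeps its own 15-minute wall time, so the returned
    # wall_time is discarded with `_` while gpus_per_node is kept for the
    # job_script.template render() call.
    partition, qos, constraint, gpus_per_node, _ = slurm_options
    wall_time = '0:15:00'

    print(partition, qos, constraint, gpus_per_node, wall_time)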
