Skip to content

Commit d7ce299

Browse files
authored Mar 31, 2025
Adjust chpl_launchcmd to be able to handle colocales with PBS (#26850)
This PR adjusts `chpl_launchcmd` so that it allocates the correct number of nodes when `CHPL_RT_LOCALES_PER_NODE` is used. Previously, it just allocated `nl` nodes, which is not the right thing to do with colocales. This currently only fixes the PBS side of the script. I'll try to fix the Slurm side in this PR before merging it. Right now, I am planning this for post-release. [Reviewed by @jabraham17] Tested on a PBS system manually.
2 parents 167d50b + 5352bff commit d7ce299

File tree

1 file changed

+27
-8
lines changed

1 file changed

+27
-8
lines changed
 

‎util/test/chpl_launchcmd.py

+27-8
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def job_name(self):
199199
logging.debug('Job name prefix is: {0}'.format(prefix))
200200

201201
cmd_basename = os.path.basename(self.test_command[0])
202-
logging.debug('Test command basname: {0}'.format(cmd_basename))
202+
logging.debug('Test command basename: {0}'.format(cmd_basename))
203203

204204
job_name = '{0}-{1}'.format(prefix, cmd_basename)
205205
logging.debug('Job name is: {0}'.format(job_name))
@@ -508,7 +508,7 @@ def _launch_qsub(self, testing_dir, output_file, error_file):
508508
os.environ["LMOD_QUIET"] = "1"
509509

510510
logging.info(
511-
'Starting {0} job "{1}" on {2} nodes with walltime {3} '
511+
'Starting {0} job "{1}" on {2} locales with walltime {3} '
512512
'and output file: {4}'.format(
513513
self.submit_bin, self.job_name, self.num_locales,
514514
self.walltime, output_file))
@@ -920,16 +920,35 @@ def _qsub_command(self, output_file, error_file):
920920
# When comm=none sub_test/start_test passes -nl -1 (i.e. num locales
921921
# is -1). For the tests to work, reserve one node and the regular
922922
# ncpus (this does not happen by default).
923-
num_locales = self.num_locales
924-
if num_locales == -1:
925-
num_locales = 1
923+
num_nodes = self.num_locales
924+
if num_nodes == -1:
925+
num_nodes = 1
926+
927+
loc_per_node = int(os.environ.get('CHPL_RT_LOCALES_PER_NODE', '1'))
928+
929+
logging.debug("Locales per node: {}".format(loc_per_node))
930+
if num_nodes%loc_per_node != 0:
931+
raise RuntimeError('Requested number of locales ({}) is not '
932+
'divisible by CHPL_RT_LOCALES_PER_NODE '
933+
'({}).'.format(num_nodes, loc_per_node))
934+
935+
num_nodes = int(num_nodes/loc_per_node)
926936

927937
if self.hostlist is not None:
938+
if loc_per_node != 1:
939+
# Engin: I am not sure if this code path is still needed, nor
940+
# can I tell how to handle colocales here. So, for now, I am
941+
# just adding a warning. If this path is needed, we can make
942+
# adjustments here.
943+
logging.warning('Hostlist and the CHPL_RT_LOCALES_PER_NODE '
944+
'environment are set. You may not get correct '
945+
'number of nodes allocated')
946+
928947
# This relies on the caller to use the correct select syntax.
929948
select_stmt = select_pattern.format(self.hostlist)
930-
select_stmt = select_stmt.replace('<num_locales>', str(num_locales))
931-
elif num_locales > 0:
932-
select_stmt = select_pattern.format(num_locales)
949+
select_stmt = select_stmt.replace('<num_nodes>', str(num_nodes))
950+
elif num_nodes > 0:
951+
select_stmt = select_pattern.format(num_nodes)
933952

934953
if self.num_cpus_resource is not None:
935954
select_stmt += ':{0}={1}'.format(

0 commit comments

Comments
 (0)