Skip to content

Commit cf76831

Browse files
committed
update ngpus_per_node for SCREAM/E3SM on Derecho
1 parent 9ac3e71 commit cf76831

File tree

1 file changed

+59
-44
lines changed

1 file changed

+59
-44
lines changed

CIME/case/case.py

Lines changed: 59 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -1569,55 +1569,70 @@ def configure(
15691569
# 3. if ngpus-per-node argument is equal to 0, it will be updated to 1 automatically.
15701570
# ----------------------------------------------------------------------------------------------------------
15711571
max_gpus_per_node = self.get_value("MAX_GPUS_PER_NODE")
1572-
if gpu_type and str(gpu_type).lower() != "none":
1573-
expect(
1574-
max_gpus_per_node,
1575-
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
1576-
)
1577-
expect(
1578-
gpu_offload,
1579-
"Both gpu-type and gpu-offload must be defined if either is defined",
1580-
)
1581-
expect(
1582-
compiler in ["nvhpc", "cray"],
1583-
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
1584-
)
1585-
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
1586-
valid_gpu_type.remove("none")
1587-
expect(
1588-
gpu_type in valid_gpu_type,
1589-
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
1590-
)
1591-
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
1592-
valid_gpu_offload.remove("none")
1593-
expect(
1594-
gpu_offload in valid_gpu_offload,
1595-
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
1596-
)
1597-
self.gpu_enabled = True
1598-
if ngpus_per_node >= 0:
1599-
self.set_value(
1600-
"NGPUS_PER_NODE",
1601-
max(1, ngpus_per_node)
1602-
if ngpus_per_node <= max_gpus_per_node
1603-
else max_gpus_per_node,
1604-
)
1605-
elif gpu_offload and str(gpu_offload).lower() != "none":
1606-
expect(
1607-
False,
1608-
"Both gpu-type and gpu-offload must be defined if either is defined",
1609-
)
1610-
elif ngpus_per_node != 0:
1611-
expect(
1612-
False,
1613-
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
1614-
)
1615-
16161572
# Set these two GPU XML variables here to overwrite the default values
16171573
# Only set them for "cesm" model
16181574
if self._cime_model == "cesm":
1575+
if gpu_type and str(gpu_type).lower() != "none":
1576+
expect(
1577+
max_gpus_per_node,
1578+
f"GPUS are not defined for machine={machine_name} and compiler={compiler}",
1579+
)
1580+
expect(
1581+
gpu_offload,
1582+
"Both gpu-type and gpu-offload must be defined if either is defined",
1583+
)
1584+
expect(
1585+
compiler in ["nvhpc", "cray"],
1586+
f"Only nvhpc and cray compilers are expected for a GPU run; the user given compiler is {compiler}, ",
1587+
)
1588+
valid_gpu_type = self.get_value("GPU_TYPE").split(",")
1589+
valid_gpu_type.remove("none")
1590+
expect(
1591+
gpu_type in valid_gpu_type,
1592+
f"Unsupported GPU type is given: {gpu_type} ; valid values are {valid_gpu_type}",
1593+
)
1594+
valid_gpu_offload = self.get_value("GPU_OFFLOAD").split(",")
1595+
valid_gpu_offload.remove("none")
1596+
expect(
1597+
gpu_offload in valid_gpu_offload,
1598+
f"Unsupported GPU programming model is given: {gpu_offload} ; valid values are {valid_gpu_offload}",
1599+
)
1600+
self.gpu_enabled = True
1601+
if ngpus_per_node >= 0:
1602+
self.set_value(
1603+
"NGPUS_PER_NODE",
1604+
max(1, ngpus_per_node)
1605+
if ngpus_per_node <= max_gpus_per_node
1606+
else max_gpus_per_node,
1607+
)
1608+
elif gpu_offload and str(gpu_offload).lower() != "none":
1609+
expect(
1610+
False,
1611+
"Both gpu-type and gpu-offload must be defined if either is defined",
1612+
)
1613+
elif ngpus_per_node != 0:
1614+
expect(
1615+
False,
1616+
f"ngpus_per_node is expected to be 0 for a pure CPU run ; {ngpus_per_node} is provided instead ;",
1617+
)
16191618
self.set_value("GPU_TYPE", str(gpu_type).lower())
16201619
self.set_value("GPU_OFFLOAD", str(gpu_offload).lower())
1620+
else:
1621+
# Assume it is SCREAM or E3SM
1622+
if "gpu" not in compiler.lower():
1623+
expect(
1624+
ngpus_per_node == 0,
1625+
f"ngpus_per_node is expected to be 0 for a pure CPU run; {ngpus_per_node} is provided instead;",
1626+
)
1627+
else:
1628+
self.gpu_enabled = True
1629+
if ngpus_per_node >= 0:
1630+
self.set_value(
1631+
"NGPUS_PER_NODE",
1632+
max(1, ngpus_per_node)
1633+
if ngpus_per_node <= max_gpus_per_node
1634+
else max_gpus_per_node,
1635+
)
16211636

16221637
self.initialize_derived_attributes()
16231638

0 commit comments

Comments
 (0)