Skip to content

Commit 634b894

Browse files
authored
Merge pull request #156 from mila-iqia/SARC-384-sanity-check-prometheus
[SARC-384] Sanity check: les gpu_types des fichiers de conf des clusters sont-ils les mêmes que ceux renvoyés par Prometheus ?
2 parents b6fbfe5 + 975b2a6 commit 634b894

8 files changed

+207
-0
lines changed
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import logging
2+
3+
from sarc.config import config
4+
from sarc.jobs.node_gpu_mapping import get_node_to_gpu
5+
6+
logger = logging.getLogger(__name__)
7+
8+
9+
def check_prometheus_vs_slurmconfig(cluster_name=None):
    """
    Verify that GPU types reported by Prometheus match the slurm config files.

    Prometheus GPU types come from the query
    `slurm_job_utilization_gpu_memory{cluster=<cluster name>}`.

    Slurm config GPU types come from the latest node => GPU mapping stored in
    the database, which is populated by the command line
    `sarc acquire slurmconfig -c <cluster name>`.

    Emits a warning for every mismatch found; returns nothing.

    Parameters
    ----------
    cluster_name: str
        Name of cluster to check. If None, all clusters are checked.
    """
    configured_clusters = config().clusters
    if cluster_name is None:
        selected_clusters = list(configured_clusters.values())
    else:
        selected_clusters = [configured_clusters[cluster_name]]

    for cluster in selected_clusters:
        # Clusters without a Prometheus endpoint cannot be checked.
        if not cluster.prometheus_url:
            continue

        # Gather the GPU types declared in the latest node => GPU mapping
        # stored in the database for this cluster.
        known_gpu_types = set()
        mapping = get_node_to_gpu(cluster.name)
        if mapping:
            for declared in mapping.node_to_gpu.values():
                # A mapping value may be either a comma-separated string
                # or an iterable of GPU type strings.
                entries = declared.split(",") if isinstance(declared, str) else declared
                for entry in entries:
                    # Reduce "gpu:<name>:<count>" entries to just "<name>".
                    if entry.startswith("gpu:") and entry.count(":") == 2:
                        entry = entry.split(":")[1]
                    known_gpu_types.add(entry)

        if not known_gpu_types:
            # No slurm config GPUs available: nothing to compare against.
            logger.warning(
                f"[prometheus][{cluster.name}] cannot find GPU types from slurm config file. "
                f"You may need to call `sarc acquire slurmconfig -c {cluster.name}`"
            )
            continue

        # Ask Prometheus which GPU types it has actually seen on this cluster.
        query = f'slurm_job_utilization_gpu_memory{{cluster="{cluster.name}"}}'
        seen_gpu_types = {
            result["metric"]["gpu_type"]
            for result in cluster.prometheus.custom_query(query)
        }

        # Warn for each Prometheus GPU type absent from the slurm config GPUs.
        for unknown in seen_gpu_types - known_gpu_types:
            logger.warning(
                f"[prometheus][{cluster.name}] gpu_type not found in slurm config file: {unknown}. "
                f"Expected: {', '.join(sorted(known_gpu_types))}"
            )
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
import re
2+
from datetime import datetime
3+
from unittest.mock import MagicMock
4+
5+
import pytest
6+
7+
from sarc.alerts.usage_alerts.prometheus_gpu_types import (
8+
check_prometheus_vs_slurmconfig,
9+
)
10+
from sarc.config import config
11+
12+
# Test scenarios for check_prometheus_vs_slurmconfig, keyed by pytest test ID.
# Each scenario provides:
#   cluster:     name of the cluster under test (must exist in the test config)
#   message:     human-readable description of the expected outcome; it is
#                embedded at the top of the regression file
#   node_to_gpu: node => GPU mapping to insert in the database
#                (empty dict means no slurm config data is available)
#   prometheus:  fake results returned by the mocked Prometheus custom_query
TESTING_DATA = {
    "00_hyrule": {
        "cluster": "hyrule",
        "message": "No node_to_gpu, so no slurm config data: should complain.",
        "node_to_gpu": {},
        "prometheus": [],
    },
    "01_gerudo": {
        "cluster": "gerudo",
        "message": "No node_to_gpu, so no slurm config data: should complain.",
        "node_to_gpu": {},
        "prometheus": [{"metric": {"gpu_type": "phantom_gpu"}}],
    },
    "10_patate": {
        "cluster": "patate",
        "message": "No prometheus data to check: no warning.",
        "node_to_gpu": {"node0": "gpu0"},
        "prometheus": [],
    },
    "11_0_fromage": {
        "cluster": "fromage",
        "message": "Both slurm config data and prometheus data available, but prometheus GPU not in slurm config: warning.",
        "node_to_gpu": {"node0": "gpu0"},
        "prometheus": [{"metric": {"gpu_type": "phantom_gpu"}}],
    },
    "11_1_raisin": {
        "cluster": "raisin",
        "message": "Both slurm config data and prometheus data available, and prometheus GPU in slurm config: no warning.",
        "node_to_gpu": {"node0": "phantom_gpu"},
        "prometheus": [{"metric": {"gpu_type": "phantom_gpu"}}],
    },
}
44+
45+
46+
@pytest.mark.usefixtures("empty_read_write_db", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", TESTING_DATA.values(), ids=TESTING_DATA.keys())
def test_check_prometheus_vs_slurmconfig(params, monkeypatch, caplog, file_regression):
    """Test each case from TEST_DATA (one test per cluster)."""

    from prometheus_api_client import PrometheusConnect

    # Replace the real Prometheus call with canned results for this scenario.
    fake_query = MagicMock(return_value=params["prometheus"])
    monkeypatch.setattr(PrometheusConnect, "custom_query", fake_query)

    # Seed the database with a node => GPU mapping when the scenario has one.
    mapping = params["node_to_gpu"]
    if mapping:
        database = config().mongo.database_instance
        database.node_gpu_mapping.insert_one(
            {
                "cluster_name": params["cluster"],
                "since": datetime.now(),
                "node_to_gpu": mapping,
            }
        )

    check_prometheus_vs_slurmconfig(cluster_name=params["cluster"])

    # Strip the volatile "WARNING module:file:line" prefix so the regression
    # file only contains the stable warning messages.
    cleaned_log = re.sub(
        r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_gpu_types:prometheus_gpu_types.py:[0-9]+ +",
        "",
        caplog.text,
    )
    file_regression.check(params["message"] + "\n\n" + cleaned_log)
80+
81+
82+
@pytest.mark.usefixtures("empty_read_write_db", "tzlocal_is_mtl")
def test_check_prometheus_vs_slurmconfig_all(monkeypatch, caplog, file_regression):
    """Test all data at once (all clusters)."""

    from prometheus_api_client import PrometheusConnect

    def _gen_fake_custom_query(self_, query_: str):
        # Return the fake Prometheus results of every scenario whose
        # cluster name appears in the query string.
        matched = []
        for params in TESTING_DATA.values():
            if params["cluster"] in query_:
                matched.extend(params["prometheus"])
        return matched

    # Replace the real Prometheus call with the per-cluster fake above.
    monkeypatch.setattr(PrometheusConnect, "custom_query", _gen_fake_custom_query)

    # Seed the database with every non-empty node => GPU mapping.
    collection = config().mongo.database_instance.node_gpu_mapping
    for params in TESTING_DATA.values():
        if not params["node_to_gpu"]:
            continue
        collection.insert_one(
            {
                "cluster_name": params["cluster"],
                "since": datetime.now(),
                "node_to_gpu": params["node_to_gpu"],
            }
        )

    check_prometheus_vs_slurmconfig()

    # Strip the volatile "WARNING module:file:line" prefix so the regression
    # file only contains the stable warning messages.
    file_regression.check(
        re.sub(
            r"WARNING +sarc\.alerts\.usage_alerts\.prometheus_gpu_types:prometheus_gpu_types.py:[0-9]+ +",
            "",
            caplog.text,
        )
    )
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
No node_to_gpu, so no slurm config data: should complain.
2+
3+
[prometheus][hyrule] cannot find GPU types from slurm config file. You may need to call `sarc acquire slurmconfig -c hyrule`
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
No node_to_gpu, so no slurm config data: should complain.
2+
3+
[prometheus][gerudo] cannot find GPU types from slurm config file. You may need to call `sarc acquire slurmconfig -c gerudo`
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
No prometheus data to check: no warning.
2+
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Both slurm config data and prometheus data available, but prometheus GPU not in slurm config: warning.
2+
3+
[prometheus][fromage] gpu_type not found in slurm config file: phantom_gpu. Expected: gpu0
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Both slurm config data and prometheus data available, and prometheus GPU in slurm config: no warning.
2+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[prometheus][fromage] gpu_type not found in slurm config file: phantom_gpu. Expected: gpu0
2+
[prometheus][gerudo] cannot find GPU types from slurm config file. You may need to call `sarc acquire slurmconfig -c gerudo`
3+
[prometheus][hyrule] cannot find GPU types from slurm config file. You may need to call `sarc acquire slurmconfig -c hyrule`
4+
[prometheus][mila] cannot find GPU types from slurm config file. You may need to call `sarc acquire slurmconfig -c mila`

0 commit comments

Comments
 (0)