Skip to content

Commit faf9e42

Browse files
authored
Merge pull request #208 from notoraptor/cw-556-cluster-state-per-day
[cw-556] add a script to generate cluster status
2 parents a918935 + 0e9eeaa commit faf9e42

File tree

3 files changed

+113
-36
lines changed

3 files changed

+113
-36
lines changed

clockwork_web/core/users_helper.py

Lines changed: 24 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -581,7 +581,12 @@ def render_template_with_user_settings(template_name_or_list, **context):
581581
context["web_settings_json_str"] = json.dumps(context["web_settings"])
582582

583583
# Send the clusters infos to the template
584-
context["clusters"] = get_all_clusters()
584+
# NB: get_all_clusters() seems to return the clusters dict itself
585+
# from config, not a copy. So, any modification on clusters dict
586+
# returned by this function will be propagated into config.
587+
# As we don't want this behaviour here, we will make a copy
588+
# of each cluster dict.
589+
context["clusters"] = {k: v.copy() for k, v in get_all_clusters().items()}
585590
# List clusters available for connected user,
586591
# or set an empty list for anon user.
587592
context["user_clusters"] = (
@@ -591,40 +596,28 @@ def render_template_with_user_settings(template_name_or_list, **context):
591596
)
592597

593598
# Get cluster status (if jobs are old and cluster has error).
594-
"""
595599
for cluster_name in context["clusters"]:
596-
# Cluster error cannot yet be checked, so
597-
# cluster_has_error is always False for now.
598-
cluster_has_error = False
599-
context["clusters"][cluster_name]["status"] = {
600-
"jobs_are_old": _jobs_are_old(cluster_name),
601-
"cluster_has_error": cluster_has_error,
602-
}
603-
"""
600+
context["clusters"][cluster_name]["status"] = _get_cluster_status(cluster_name)
604601

605602
return render_template(template_name_or_list, **context)
606603

607604

608-
def _jobs_are_old(cluster_name):
609-
"""Return True if last slurm update in given cluster is older than 2 days."""
610-
jobs_are_old = False
605+
def _get_cluster_status(cluster_name):
606+
"""
607+
Get cluster status from DB collection `cluster_status`.
611608
612-
mongodb_filter = {"slurm.cluster_name": cluster_name}
609+
Collection should be updated from an independent script
610+
(`scripts/update_clusters_status.py`) regularly.
611+
"""
613612
mc = get_db()
614-
job_with_max_cw_last_slurm_update = list(
615-
mc["jobs"].find(mongodb_filter).sort([("cw.last_slurm_update", -1)]).limit(1)
616-
)
617-
618-
if job_with_max_cw_last_slurm_update:
619-
(job,) = job_with_max_cw_last_slurm_update
620-
if "last_slurm_update" in job["cw"]:
621-
most_recent_job_edition = job["cw"]["last_slurm_update"]
622-
current_timestamp = datetime.now().timestamp()
623-
elapsed_time = timedelta(
624-
seconds=current_timestamp - most_recent_job_edition
625-
)
626-
# Let's say the latest jobs edition must not be older than max_delay.
627-
max_delay = timedelta(days=2)
628-
jobs_are_old = elapsed_time > max_delay
629-
630-
return jobs_are_old
613+
statuses = list(mc["cluster_status"].find({"cluster_name": cluster_name}))
614+
if statuses:
615+
# Status found
616+
(status,) = statuses
617+
return status
618+
else:
619+
# No status found, return default values
620+
return {
621+
"jobs_are_old": False,
622+
"cluster_has_error": False,
623+
}

clockwork_web/templates/base.html

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -232,17 +232,17 @@ <h1><a data-bs-toggle="collapse" data-bs-target=".formCollapse" aria-expanded="f
232232
<i class="fa-solid fa-file-lines" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Grafana cluster link') }}"></i>
233233
</a>
234234
<!-- cluster status -->
235-
<!--
236-
{# if D_cluster['status']['jobs_are_old'] #}
235+
{% if D_cluster['status']['jobs_are_old'] %}
237236
<span class="cluster-info cluster-warning">
238237
<i class="fa-solid fa-triangle-exclamation" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Possible stale jobs. Most recent update was more than 30 days ago') }}"></i>
239238
</span>
240-
241-
{# else #}
239+
240+
{% else %}
242241
<span class="cluster-info cluster-good">
243242
<i class="fa-solid fa-ballot-check" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('Latest update to jobs is relatively recent (at most 30 days ago)') }}"></i>
244243
</span>
245-
{# endif #}
244+
{% endif %}
245+
<!-- `cluster_has_error` not yet updated, thus still not displayed
246246
{# if D_cluster['status']['cluster_has_error'] #}
247247
<span class="cluster-info cluster-error">
248248
<i class="fa-solid fa-octagon-exclamation" data-bs-toggle="tooltip" data-bs-placement="right" title="{{ gettext('A cluster error occurred recently') }}"></i>

scripts/update_clusters_status.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from datetime import datetime, timedelta
2+
3+
from clockwork_web.config import register_config
4+
from clockwork_web.core.clusters_helper import get_all_clusters
5+
from slurm_state.mongo_client import get_mongo_client
6+
from slurm_state.config import get_config
7+
8+
9+
def main():
    """Compute the status of every configured cluster and store it in MongoDB.

    For each cluster returned by `get_all_clusters()`, a status document
    (`cluster_name`, `jobs_are_old`, `cluster_has_error`) is built and then
    upserted into the `cluster_status` collection, keyed by `cluster_name`.
    All statuses are computed before anything is written.
    """
    # Declare the config entries needed to reach the database.
    register_config("mongo.connection_string", "")
    register_config("mongo.database_name", "clockwork")

    # Resolve the database and the target collection.
    collection_name = "cluster_status"
    database = get_mongo_client()[get_config("mongo.database_name")]
    status_collection = database[collection_name]

    # Build one status document per cluster.
    # Cluster errors cannot be checked yet, so cluster_has_error
    # is always False for now.
    statuses = [
        {
            "cluster_name": name,
            "jobs_are_old": _jobs_are_old(database, name),
            "cluster_has_error": False,
        }
        for name in get_all_clusters()
    ]

    # Create the compound index only when the collection reports no index.
    # NOTE(review): an existing collection presumably always reports the
    # default `_id` index, so this branch likely only runs before the
    # collection is first created -- confirm that this is intended.
    existing_indexes = list(status_collection.list_indexes())
    if len(existing_indexes) == 0:
        print("Create index for collection:", collection_name)
        index_keys = [
            ("cluster_name", 1),
            ("jobs_are_old", 1),
            ("cluster_has_error", 1),
        ]
        status_collection.create_index(index_keys, name="cluster_status_index")

    # Insert or refresh each cluster's status document.
    for status in statuses:
        selector = {"cluster_name": status["cluster_name"]}
        status_collection.update_one(selector, {"$set": status}, upsert=True)

    print("Updated.")
57+
58+
59+
def _jobs_are_old(mc, cluster_name, max_delay=timedelta(days=2)):
    """Return True if the latest Slurm update of a cluster is older than `max_delay`.

    The job with the greatest `cw.last_slurm_update` is looked up in the
    `jobs` collection among jobs whose `slurm.cluster_name` equals
    `cluster_name`, and its update time is compared with the current time.

    Parameters:
        mc: database object; `mc["jobs"]` must expose `find().sort().limit()`.
        cluster_name: name of the cluster whose jobs are inspected.
        max_delay: `timedelta` beyond which jobs are considered old
            (default: 2 days).

    Returns:
        True if the most recent update is older than `max_delay`.
        False otherwise, including when the cluster has no job or when the
        selected job has no usable `cw.last_slurm_update` value.

    NOTE(review): the web template tooltip speaks of "30 days" while the
    default threshold here is 2 days -- confirm which one is intended.
    """
    # Descending sort + limit(1): keep only the job with the greatest
    # `cw.last_slurm_update`.
    newest_jobs = list(
        mc["jobs"]
        .find({"slurm.cluster_name": cluster_name})
        .sort([("cw.last_slurm_update", -1)])
        .limit(1)
    )
    if not newest_jobs:
        return False

    (job,) = newest_jobs
    # Tolerate a missing/null `cw` sub-document or a null timestamp instead
    # of raising KeyError/TypeError: such a job gives no information, so the
    # cluster is not flagged (same outcome as a missing `last_slurm_update`).
    last_update = (job.get("cw") or {}).get("last_slurm_update")
    if last_update is None:
        return False

    # `last_update` is compared against `datetime.now().timestamp()`,
    # so it is expected to be a timestamp expressed in seconds.
    elapsed_time = timedelta(seconds=datetime.now().timestamp() - last_update)
    return elapsed_time > max_delay
81+
82+
83+
if __name__ == "__main__":
84+
main()

0 commit comments

Comments
 (0)