Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
94 changes: 94 additions & 0 deletions sarc/alerts/usage_alerts/gpu_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import logging
from datetime import datetime, timedelta
from typing import Optional

from sarc.config import MTL
from sarc.jobs.series import load_job_series

logger = logging.getLogger(__name__)


def check_gpu_type_usage_per_node(
    gpu_type: str,
    time_interval: Optional[timedelta] = timedelta(hours=24),
    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
    threshold: float = 1.0,
    min_drac_tasks: int = 0,
    ignore_min_tasks_for_clusters=("mila",),
):
    """
    Check if a GPU type is sufficiently used on each node.

    Log a warning for each node where the ratio of jobs using the GPU type
    is less than the given threshold.

    Parameters
    ----------
    gpu_type: str
        GPU type to check.
    time_interval: timedelta, optional
        If given, only jobs which ran in [now - time_interval, now] will be used for checking.
        Default is last 24 hours.
        If None, all jobs are used.
    minimum_runtime: timedelta, optional
        If given, only jobs which ran at least for this minimum runtime (within the
        checked interval) will be used for checking.
        Default is 5 minutes.
        If None, treated as 0.
    threshold: float
        A value between 0 and 1 representing the minimum expected ratio of jobs that
        use the given GPU type w.r.t. running jobs on each node. Log a warning if the
        computed ratio is less than this threshold.
    min_drac_tasks: int
        Minimum number of jobs required on a node to perform the check, for clusters
        NOT listed in `ignore_min_tasks_for_clusters`. The check is skipped on nodes
        where available jobs are fewer than this number.
    ignore_min_tasks_for_clusters: sequence of str, optional
        Cluster names exempted from the `min_drac_tasks` requirement (the check is
        always performed for their nodes). Default is ``("mila",)``, which preserves
        the previous behavior where the Mila cluster was hard-coded as exempt.
        If None or empty, no cluster is exempted.
    """
    # Normalize the exemption list once; None means "no exemption".
    exempt_clusters = frozenset(ignore_min_tasks_for_clusters or ())

    # Parse time_interval into an explicit [start, end] window.
    start, end, clip_time = None, None, False
    if time_interval is not None:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    # Parse minimum_runtime: None means no minimum.
    if minimum_runtime is None:
        minimum_runtime = timedelta(seconds=0)

    # Get data frame. We clip time if start and end are available,
    # so that minimum_runtime is compared to job running time in given interval.
    df = load_job_series(start=start, end=end, clip_time=clip_time)

    # Add a column `gpu_task_` which is truthy (sums as 1) for each job running
    # on the given GPU type.
    df.loc[:, "gpu_task_"] = df["allocated.gpu_type"] == gpu_type
    # Add a column `task_` with value 1 for each job. Used later to count jobs
    # in a groupby().
    df.loc[:, "task_"] = 1

    # Group jobs.
    ff = (
        # Select only jobs where elapsed time >= minimum runtime and gres_gpu > 0
        df[
            (df["elapsed_time"] >= minimum_runtime.total_seconds())
            & (df["allocated.gres_gpu"] > 0)
        ]
        # `nodes` is a list of nodes. We explode this column so each job is
        # counted once per node where it is running.
        .explode("nodes")
        # Then we group by cluster name and nodes,
        .groupby(["cluster_name", "nodes"])[["gpu_task_", "task_"]]
        # and we sum on gpu_task_ and task_
        .sum()
    )
    # Finally, we compute GPU usage per node.
    ff["gpu_usage_"] = ff["gpu_task_"] / ff["task_"]

    # We can now check GPU usage. We warn if gpu usage < threshold and if
    # either the cluster is exempted from the minimum-task requirement,
    # or the node has enough jobs.
    for row in ff.itertuples():
        cluster_name, node = row.Index
        nb_gpu_tasks = row.gpu_task_
        nb_tasks = row.task_
        gpu_usage = row.gpu_usage_
        if gpu_usage < threshold and (
            cluster_name in exempt_clusters or nb_tasks >= min_drac_tasks
        ):
            logger.warning(
                f"[{cluster_name}][{node}] insufficient usage for GPU {gpu_type}: "
                f"{round(gpu_usage * 100, 2)} % ({nb_gpu_tasks}/{nb_tasks}), "
                f"minimum required: {round(threshold * 100, 2)} %"
            )
4 changes: 3 additions & 1 deletion sarc/jobs/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,7 +321,9 @@ def load_job_series(
"gpu_utilization", "cpu_utilization", "gpu_memory", "gpu_power", "system_memory"
- Optional job series fields, added if clip_time is True:
"unclipped_start" and "unclipped_end"
- Optional user info fields if job users found. See `_user_to_series` for user fields.
- Optional user info fields if job users found.
Fields from `User.dict()` in format `user.<flattened dot-separated field>`,
+ special field `user.primary_email` containing either `user.mila.email` or fallback `job.user`.
"""

# If fields is a list, convert it to a renaming dict with same old and new names.
Expand Down
Empty file.
11 changes: 11 additions & 0 deletions tests/functional/usage_alerts/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
def _get_warnings(text: str, module: str) -> list:
"""Parse warning messages from given text (typically caplog.text)"""
warnings = []
for line in text.split("\n"):
line = line.strip()
if line.startswith("WARNING "):
line_content = line[len("WARNING") :].lstrip()
line_ref, warning_msg = line_content.split(" ", maxsplit=1)
assert line_ref.startswith(f"{module}:"), line_ref
warnings.append(warning_msg.strip())
return warnings
170 changes: 170 additions & 0 deletions tests/functional/usage_alerts/test_alert_gpu_usage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
"""
Initial jobs in read_only_db (for reference):

| | job_id | cluster_name | nodes | allocated.gres_gpu | allocated.gpu_type | start_time | end_time | elapsed_time |
|---:|----------:|:---------------|:----------------------------------|---------------------:|:---------------------|:--------------------------|:--------------------------|---------------:|
| 0 | 1 | raisin | ['cn-c021'] | 1 | | 2023-02-14 00:01:00-05:00 | 2023-02-14 12:01:00-05:00 | 43200 |
| 1 | 2 | raisin | ['cn-c021'] | 1 | | 2023-02-14 06:01:00-05:00 | 2023-02-14 18:01:00-05:00 | 43200 |
| 2 | 3 | raisin | ['cn-c021'] | 1 | | 2023-02-14 12:01:00-05:00 | 2023-02-15 00:01:00-05:00 | 43200 |
| 3 | 4 | raisin | ['cn-c021'] | 1 | | 2023-02-14 18:01:00-05:00 | 2023-02-15 06:01:00-05:00 | 43200 |
| 4 | 5 | raisin | ['cn-c021'] | 1 | | 2023-02-15 00:01:00-05:00 | 2023-02-15 12:01:00-05:00 | 43200 |
| 5 | 6 | raisin | ['cn-c021'] | 1 | | 2023-02-15 06:01:00-05:00 | 2023-02-15 18:01:00-05:00 | 43200 |
| 6 | 7 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 7 | 8 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 |
| 8 | 9 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 9 | 10 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 10 | 11 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 |
| 11 | 12 | raisin | ['bart'] | 1 | | 2023-02-16 18:01:00-05:00 | 2023-02-17 06:01:00-05:00 | 43200 |
| 12 | 13 | raisin | ['cn-c021', 'cn-c022', 'cn-d001'] | 1 | | 2023-02-17 00:01:00-05:00 | 2023-02-17 12:01:00-05:00 | 43200 |
| 13 | 14 | raisin | ['cn-c021'] | 1 | | 2023-02-17 06:01:00-05:00 | 2023-02-17 18:01:00-05:00 | 43200 |
| 14 | 15 | fromage | ['cn-c021'] | 1 | | 2023-02-17 12:01:00-05:00 | 2023-02-18 00:01:00-05:00 | 43200 |
| 15 | 16 | patate | ['cn-c021'] | 1 | | 2023-02-17 18:01:00-05:00 | 2023-02-18 06:01:00-05:00 | 43200 |
| 16 | 17 | raisin | ['cn-c021'] | 1 | | 2023-02-18 00:01:00-05:00 | 2023-02-18 12:01:00-05:00 | 43200 |
| 17 | 18 | raisin | ['cn-c021'] | 1 | | 2023-02-18 06:01:00-05:00 | 2023-02-18 18:01:00-05:00 | 43200 |
| 18 | 19 | mila | ['cn-c021'] | 1 | | 2023-02-18 12:01:00-05:00 | 2023-02-19 00:01:00-05:00 | 43200 |
| 19 | 20 | raisin | ['cn-c021'] | 1 | | 2023-02-18 18:01:00-05:00 | 2023-02-19 06:01:00-05:00 | 43200 |
| 20 | 1000000 | raisin | ['cn-c017'] | 1 | | 2023-02-19 00:01:00-05:00 | 2023-02-19 12:01:00-05:00 | 43200 |
| 21 | 1000000 | raisin | ['cn-b099'] | 1 | | 2023-02-19 06:01:00-05:00 | 2023-02-19 18:01:00-05:00 | 43200 |
| 22 | 23 | raisin | ['cn-c021'] | 2 | A100 | 2023-02-19 12:01:00-05:00 | 2023-02-20 00:01:00-05:00 | 43200 |
| 23 | 999999999 | mila | ['cn-c021'] | 0 | | 2023-02-19 18:01:00-05:00 | 2023-02-20 12:01:00-05:00 | 64800 |
"""

import functools
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

from .common import _get_warnings

# `_get_warnings` pre-bound with the "module:filename" location prefix that
# caplog emits for warnings logged from sarc.alerts.usage_alerts.gpu_usage.
get_warnings = functools.partial(
    _get_warnings, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py"
)


# Time is frozen at MOCK_TIME so the default 24-hour interval selects a
# deterministic subset of the read_only_db jobs (see table in module docstring).
@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params,expected",
    [
        (
            # Check GPU A100 with no interval (i.e. all jobs)
            dict(gpu_type="A100", time_interval=None, minimum_runtime=None),
            [
                "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
                "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
            ],
        ),
        (
            # Check GPU A100 with no interval (i.e. all jobs) and minimum runtime
            # All jobs have elapsed_time == 43200, so they all still match.
            dict(
                gpu_type="A100",
                time_interval=None,
                minimum_runtime=timedelta(seconds=43200),
            ),
            [
                "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %",
                "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %",
            ],
        ),
        (
            # Check GPU A100 with no interval (i.e. all jobs) and minimum runtime too high
            # 43200 + 1 excludes every job, so no warning is expected.
            dict(
                gpu_type="A100",
                time_interval=None,
                minimum_runtime=timedelta(seconds=43200 + 1),
            ),
            [],
        ),
        (
            # Check GPU A100 for all jobs with a greater threshold.
            # Node raisin/cn-c021 (5.88 %) now passes the 5 % threshold.
            dict(
                gpu_type="A100",
                time_interval=None,
                minimum_runtime=None,
                threshold=5 / 100,
            ),
            [
                "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                # "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 5.0 %",
                "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
                "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %",
            ],
        ),
        (
            # Check GPU A100 for all jobs with threshold zero.
            # No ratio can be < 0, so no warning is expected.
            dict(
                gpu_type="A100", time_interval=None, minimum_runtime=None, threshold=0
            ),
            [],
        ),
        (
            # Check GPU A100 for all jobs, a greater threshold, and minimum number of jobs per drac node set to 2.
            # Only mila nodes (always checked) and DRAC nodes with >= 2 jobs are reported.
            dict(
                gpu_type="A100",
                time_interval=None,
                minimum_runtime=None,
                threshold=10 / 100,
                min_drac_tasks=2,
            ),
            [
                # "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                # "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                # "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                # "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                # "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 10.0 %",
                # "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
                # "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %",
            ],
        ),
        (
            # Check GPU A100 with default intervals (24 hours).
            # Only 2 jobs (6 and 7) will match for current frozen mock time.
            dict(gpu_type="A100"),
            [
                "[raisin][cn-c021] insufficient usage for GPU A100: 0.0 % (0/2), minimum required: 100.0 %",
            ],
        ),
        (
            # Check unknown GPU.
            # No job uses it, so every node with GPU jobs is reported at 0 %.
            dict(gpu_type="unknown", time_interval=None),
            [
                "[fromage][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[mila][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[patate][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][bart] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-b099] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c017] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/17), minimum required: 100.0 %",
                "[raisin][cn-c022] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
                "[raisin][cn-d001] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %",
            ],
        ),
    ],
)
def test_check_gpu_type_usage_per_node(params, expected, caplog):
    """Run the GPU-usage check against read_only_db and compare logged warnings."""
    check_gpu_type_usage_per_node(**params)
    assert get_warnings(caplog.text) == expected