diff --git a/sarc/alerts/usage_alerts/__init__.py b/sarc/alerts/usage_alerts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/sarc/alerts/usage_alerts/gpu_usage.py b/sarc/alerts/usage_alerts/gpu_usage.py
new file mode 100644
index 00000000..8507f74c
--- /dev/null
+++ b/sarc/alerts/usage_alerts/gpu_usage.py
@@ -0,0 +1,99 @@
+import logging
+from datetime import datetime, timedelta
+from typing import Optional, Sequence
+
+from sarc.config import MTL
+from sarc.jobs.series import load_job_series
+
+logger = logging.getLogger(__name__)
+
+
+def check_gpu_type_usage_per_node(
+    gpu_type: str,
+    time_interval: Optional[timedelta] = timedelta(hours=24),
+    minimum_runtime: Optional[timedelta] = timedelta(minutes=5),
+    threshold: float = 1.0,
+    min_tasks: int = 0,
+    ignore_min_tasks_for_clusters: Optional[Sequence[str]] = ("mila",),
+):
+    """
+    Check if a GPU type is sufficiently used on each node.
+    Log a warning for each node where the ratio of jobs using the GPU type is less than the given threshold.
+
+    Parameters
+    ----------
+    gpu_type: str
+        GPU type to check.
+    time_interval: timedelta
+        If given, only jobs which ran in [now - time_interval, now] will be used for checking.
+        Default is the last 24 hours.
+        If None, all jobs are used.
+    minimum_runtime: timedelta
+        If given, only jobs which ran for at least this minimum runtime will be used for checking.
+        Default is 5 minutes.
+        If None, set to 0.
+    threshold: float
+        A value between 0 and 1 representing the minimum expected ratio of jobs that use the given GPU type
+        w.r.t. running jobs on each node. A warning is logged if the computed ratio is less than this threshold.
+    min_tasks: int
+        Minimum number of jobs required on a cluster node for the check to be performed.
+        The check is performed on a node only if it contains at least `min_tasks` jobs
+        or if the node's cluster is in `ignore_min_tasks_for_clusters`.
+    ignore_min_tasks_for_clusters: Sequence
+        Clusters to check even if nodes from those clusters don't have `min_tasks` jobs.
+    """
+    # Parse time_interval
+    start, end, clip_time = None, None, False
+    if time_interval is not None:
+        end = datetime.now(tz=MTL)
+        start = end - time_interval
+        clip_time = True
+
+    # Parse minimum_runtime
+    if minimum_runtime is None:
+        minimum_runtime = timedelta(seconds=0)
+
+    # Get data frame. We clip time if start and end are available,
+    # so that minimum_runtime is compared to job running time within the given interval.
+    df = load_job_series(start=start, end=end, clip_time=clip_time)
+
+    # Add a boolean column `gpu_task_`, True for each job running on the given GPU type.
+    df.loc[:, "gpu_task_"] = df["allocated.gpu_type"] == gpu_type
+    # Add a column `task_` with value 1 for each job. Used later to count jobs in a groupby().
+    df.loc[:, "task_"] = 1
+
+    # Group jobs.
+    ff = (
+        # Select only jobs where elapsed time >= minimum runtime and gres_gpu > 0
+        df[
+            (df["elapsed_time"] >= minimum_runtime.total_seconds())
+            & (df["allocated.gres_gpu"] > 0)
+        ]
+        # `nodes` is a list of nodes. We explode this column to count each job for each node where it is running.
+        .explode("nodes")
+        # Then we group by cluster name and nodes,
+        .groupby(["cluster_name", "nodes"])[["gpu_task_", "task_"]]
+        # and we sum on gpu_task_ and task_.
+        .sum()
+    )
+    # Finally, we compute GPU usage.
+    ff["gpu_usage_"] = ff["gpu_task_"] / ff["task_"]
+
+    # We can now check GPU usage.
+    ignore_min_tasks_for_clusters = set(ignore_min_tasks_for_clusters or ())
+    for row in ff.itertuples():
+        cluster_name, node = row.Index
+        nb_gpu_tasks = row.gpu_task_
+        nb_tasks = row.task_
+        gpu_usage = row.gpu_usage_
+        if gpu_usage < threshold and (
+            cluster_name in ignore_min_tasks_for_clusters or nb_tasks >= min_tasks
+        ):
+            # We warn if GPU usage < threshold and if
+            # either we are on a cluster listed in `ignore_min_tasks_for_clusters`,
+            # or there are enough jobs on the node.
+            logger.warning(
+                f"[{cluster_name}][{node}] insufficient usage for GPU {gpu_type}: "
+                f"{round(gpu_usage * 100, 2)} % ({nb_gpu_tasks}/{nb_tasks}), "
+                f"minimum required: {round(threshold * 100, 2)} %"
+            )
diff --git a/sarc/jobs/series.py b/sarc/jobs/series.py
index 51b8aba5..6b9754a3 100644
--- a/sarc/jobs/series.py
+++ b/sarc/jobs/series.py
@@ -321,7 +321,9 @@ def load_job_series(
       "gpu_utilization", "cpu_utilization", "gpu_memory", "gpu_power", "system_memory"
     - Optional job series fields, added if clip_time is True: "unclipped_start" and "unclipped_end"
-    - Optional user info fields if job users found. See `_user_to_series` for user fields.
+    - Optional user info fields if job users found.
+      Fields from `User.dict()`, prefixed with `user.`,
+      + special field `user.primary_email` containing either `user.mila.email` or fallback `job.user`.
     """
     # If fields is a list, convert it to a renaming dict with same old and new names.
diff --git a/tests/functional/usage_alerts/__init__.py b/tests/functional/usage_alerts/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/functional/usage_alerts/common.py b/tests/functional/usage_alerts/common.py
new file mode 100644
index 00000000..0725a65b
--- /dev/null
+++ b/tests/functional/usage_alerts/common.py
@@ -0,0 +1,11 @@
+def _get_warnings(text: str, module: str) -> list:
+    """Parse warning messages from given text (typically caplog.text)."""
+    warnings = []
+    for line in text.split("\n"):
+        line = line.strip()
+        if line.startswith("WARNING "):
+            line_content = line[len("WARNING") :].lstrip()
+            line_ref, warning_msg = line_content.split(" ", maxsplit=1)
+            assert line_ref.startswith(f"{module}:"), line_ref
+            warnings.append(warning_msg.strip())
+    return warnings
diff --git a/tests/functional/usage_alerts/test_alert_gpu_usage.py b/tests/functional/usage_alerts/test_alert_gpu_usage.py
new file mode 100644
index 00000000..41f148d2
--- /dev/null
+++ b/tests/functional/usage_alerts/test_alert_gpu_usage.py
@@ -0,0 +1,170 @@
+"""
+Initial jobs in read_only_db (for reference):
+
|    |    job_id | cluster_name   | nodes                             |   allocated.gres_gpu | allocated.gpu_type   | start_time                | end_time                  |   elapsed_time |
|---:|----------:|:---------------|:----------------------------------|---------------------:|:---------------------|:--------------------------|:--------------------------|---------------:|
|  0 |         1 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-14 00:01:00-05:00 | 2023-02-14 12:01:00-05:00 |          43200 |
|  1 |         2 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-14 06:01:00-05:00 | 2023-02-14 18:01:00-05:00 |          43200 |
|  2 |         3 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-14 12:01:00-05:00 | 2023-02-15 00:01:00-05:00 |          43200 |
|  3 |         4 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-14 18:01:00-05:00 | 2023-02-15 06:01:00-05:00 |          43200 |
|  4 |         5 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-15 00:01:00-05:00 | 2023-02-15 12:01:00-05:00 |          43200 |
|  5 |         6 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-02-15 06:01:00-05:00 | 2023-02-15 18:01:00-05:00 |          43200 |
|  6 |         7 | raisin         | ['cn-c021']                       |                    1 |                      | 2023-11-21 07:00:00-05:00 |
2023-11-21 19:00:00-05:00 | 43200 | +| 7 | 8 | raisin | ['cn-c021'] | 1 | | 2023-11-21 07:00:00-05:00 | 2023-11-21 19:00:00-05:00 | 43200 | +| 8 | 9 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 | +| 9 | 10 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 | +| 10 | 11 | raisin | ['cn-c021'] | 1 | | 2023-02-16 00:01:00-05:00 | 2023-02-16 12:01:00-05:00 | 43200 | +| 11 | 12 | raisin | ['bart'] | 1 | | 2023-02-16 18:01:00-05:00 | 2023-02-17 06:01:00-05:00 | 43200 | +| 12 | 13 | raisin | ['cn-c021', 'cn-c022', 'cn-d001'] | 1 | | 2023-02-17 00:01:00-05:00 | 2023-02-17 12:01:00-05:00 | 43200 | +| 13 | 14 | raisin | ['cn-c021'] | 1 | | 2023-02-17 06:01:00-05:00 | 2023-02-17 18:01:00-05:00 | 43200 | +| 14 | 15 | fromage | ['cn-c021'] | 1 | | 2023-02-17 12:01:00-05:00 | 2023-02-18 00:01:00-05:00 | 43200 | +| 15 | 16 | patate | ['cn-c021'] | 1 | | 2023-02-17 18:01:00-05:00 | 2023-02-18 06:01:00-05:00 | 43200 | +| 16 | 17 | raisin | ['cn-c021'] | 1 | | 2023-02-18 00:01:00-05:00 | 2023-02-18 12:01:00-05:00 | 43200 | +| 17 | 18 | raisin | ['cn-c021'] | 1 | | 2023-02-18 06:01:00-05:00 | 2023-02-18 18:01:00-05:00 | 43200 | +| 18 | 19 | mila | ['cn-c021'] | 1 | | 2023-02-18 12:01:00-05:00 | 2023-02-19 00:01:00-05:00 | 43200 | +| 19 | 20 | raisin | ['cn-c021'] | 1 | | 2023-02-18 18:01:00-05:00 | 2023-02-19 06:01:00-05:00 | 43200 | +| 20 | 1000000 | raisin | ['cn-c017'] | 1 | | 2023-02-19 00:01:00-05:00 | 2023-02-19 12:01:00-05:00 | 43200 | +| 21 | 1000000 | raisin | ['cn-b099'] | 1 | | 2023-02-19 06:01:00-05:00 | 2023-02-19 18:01:00-05:00 | 43200 | +| 22 | 23 | raisin | ['cn-c021'] | 2 | A100 | 2023-02-19 12:01:00-05:00 | 2023-02-20 00:01:00-05:00 | 43200 | +| 23 | 999999999 | mila | ['cn-c021'] | 0 | | 2023-02-19 18:01:00-05:00 | 2023-02-20 12:01:00-05:00 | 64800 | +""" + +import functools +from datetime import timedelta + +import pytest + +from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node +from tests.functional.jobs.test_func_load_job_series import MOCK_TIME + +from .common import _get_warnings + +get_warnings = functools.partial( + _get_warnings, module="sarc.alerts.usage_alerts.gpu_usage:gpu_usage.py" +) + + +@pytest.mark.freeze_time(MOCK_TIME) +@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl") +@pytest.mark.parametrize( + "params,expected", + [ + ( + # Check GPU A100 with no interval (i.e. all jobs) + dict(gpu_type="A100", time_interval=None, minimum_runtime=None), + [ + "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %", + "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + ], + ), + ( + # Check GPU A100 with no interval (i.e. 
all jobs) and minimum runtime + dict( + gpu_type="A100", + time_interval=None, + minimum_runtime=timedelta(seconds=43200), + ), + [ + "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 100.0 %", + "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 100.0 %", + ], + ), + ( + # Check GPU A100 with no interval (i.e. all jobs) and minimum runtime too high + dict( + gpu_type="A100", + time_interval=None, + minimum_runtime=timedelta(seconds=43200 + 1), + ), + [], + ), + ( + # Check GPU A100 for all jobs with a greater threshold. + dict( + gpu_type="A100", + time_interval=None, + minimum_runtime=None, + threshold=5 / 100, + ), + [ + "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + # "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 5.0 %", + "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 5.0 %", + ], + ), + ( + # Check GPU A100 for all jobs with threshold zero. + dict( + gpu_type="A100", time_interval=None, minimum_runtime=None, threshold=0 + ), + [], + ), + ( + # Check GPU A100 for all jobs, a greater threshold, and minimum number of jobs per drac node set to 2. + dict( + gpu_type="A100", + time_interval=None, + minimum_runtime=None, + threshold=10 / 100, + min_tasks=2, + ), + [ + # "[fromage][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + "[mila][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + # "[patate][cn-c021] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + # "[raisin][bart] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + # "[raisin][cn-b099] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + # "[raisin][cn-c017] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + "[raisin][cn-c021] insufficient usage for GPU A100: 5.88 % (1/17), minimum required: 10.0 %", + # "[raisin][cn-c022] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + # "[raisin][cn-d001] insufficient usage for GPU A100: 0.0 % (0/1), minimum required: 10.0 %", + ], + ), + ( + # Check GPU A100 with default intervals (24 hours). 
+ # Only 2 jobs (6 and 7) will match for current frozen mock time. + dict(gpu_type="A100"), + [ + "[raisin][cn-c021] insufficient usage for GPU A100: 0.0 % (0/2), minimum required: 100.0 %", + ], + ), + ( + # Check unknown GPU. + dict(gpu_type="unknown", time_interval=None), + [ + "[fromage][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[mila][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[patate][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][bart] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-b099] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c017] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-c021] insufficient usage for GPU unknown: 0.0 % (0/17), minimum required: 100.0 %", + "[raisin][cn-c022] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + "[raisin][cn-d001] insufficient usage for GPU unknown: 0.0 % (0/1), minimum required: 100.0 %", + ], + ), + ], +) +def test_check_gpu_type_usage_per_node(params, expected, caplog): + check_gpu_type_usage_per_node(**params) + assert get_warnings(caplog.text) == expected
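For reference, a minimal usage sketch (not part of the diff above): it shows how `check_gpu_type_usage_per_node` could be called from an ad-hoc monitoring script, and assumes a SARC configuration and database connection are already set up so that `load_job_series` can query jobs. The parameter values are illustrative only.

    import logging
    from datetime import timedelta

    from sarc.alerts.usage_alerts.gpu_usage import check_gpu_type_usage_per_node

    # Warnings are emitted through the module logger, so logging must be configured.
    logging.basicConfig(level=logging.WARNING)

    # Hypothetical invocation: warn for each node where fewer than 80 % of the GPU jobs
    # seen in the last 7 days (and running for at least 5 minutes) used an A100, skipping
    # nodes with fewer than 5 such jobs, except on the "mila" cluster which is always checked.
    check_gpu_type_usage_per_node(
        gpu_type="A100",
        time_interval=timedelta(days=7),
        minimum_runtime=timedelta(minutes=5),
        threshold=0.8,
        min_tasks=5,
        ignore_min_tasks_for_clusters=("mila",),
    )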