-
Notifications
You must be signed in to change notification settings - Fork 4
[SARC-328] Implémenter les alertes : Nombre de jobs CPU/GPU (actives ou inactives) sur un cluster sur une période X #128
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
6b576ce
e6a0fb0
d403fdb
5953724
6d4db7d
6d829bd
0520262
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,104 @@ | ||
| import logging | ||
| from datetime import datetime, timedelta | ||
| from typing import List, Optional | ||
|
|
||
| from sarc.config import MTL | ||
| from sarc.jobs.series import compute_time_frames, load_job_series | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
def check_nb_jobs_per_cluster_per_time(
    time_interval: Optional[timedelta] = timedelta(days=7),
    time_unit: timedelta = timedelta(days=1),
    cluster_names: Optional[List[str]] = None,
):
    """
    Check if we have scraped enough jobs per cluster per time unit on given time interval.

    Log a warning for each cluster-timestamp pair where the number of jobs is
    lower than a required limit computed using mean and standard deviation
    statistics from clusters usage.

    Parameters
    ----------
    time_interval: timedelta
        If given, only jobs which ran in [now - time_interval, now] will be used for checking.
        Default is last 7 days.
        If None, all jobs are used.
    time_unit: timedelta
        Time unit in which we must check cluster usage through time_interval. Default is 1 day.
    cluster_names: list
        Optional list of clusters to check.
        If empty (or not specified), use all clusters available among jobs retrieved with time_interval.
    """

    # Resolve the [start, end] window from time_interval. With no interval,
    # load_job_series receives None bounds and returns all jobs, unclipped.
    start, end, clip_time = None, None, False
    if time_interval is not None:
        end = datetime.now(tz=MTL)
        start = end - time_interval
        clip_time = True

    # Get data frame
    df = load_job_series(start=start, end=end, clip_time=clip_time)

    # Split data frame into time frames using `time_unit`
    tf = compute_time_frames(df, frame_size=time_unit)

    # List clusters
    if not cluster_names:
        cluster_names = sorted(df["cluster_name"].unique())

    # Generate a dataframe for stats: count jobs per timestamp (via the
    # `job_id` column), restricted to the requested clusters.
    f_stats = (
        tf[tf["cluster_name"].isin(cluster_names)]
        .groupby(["timestamp"])[["job_id"]]
        .count()
    )
    # Compute cluster usage: number of jobs per cluster per timestamp.
    # NB: dividing by len(cluster_names) means that requesting clusters absent
    # from the data lowers the average usage (and thus the threshold).
    f_stats["jobs_per_cluster"] = f_stats["job_id"] / len(cluster_names)

    # Compute average cluster usage
    avg = f_stats["jobs_per_cluster"].mean()
    # Compute standard deviation for cluster usage
    stddev = f_stats["jobs_per_cluster"].std()
    # Compute threshold to use for warnings: <average> - 2 * <standard deviation>,
    # clamped at 0 so very noisy usage never yields a negative requirement.
    threshold = max(0, avg - 2 * stddev)

    # List to collect warnings:
    reports = []
    # Set of cluster-timestamp associations found while checking warnings:
    founds = set()

    # Check cluster usage from data frame
    ff = (
        tf[tf["cluster_name"].isin(cluster_names)]
        .groupby(["cluster_name", "timestamp"])[["job_id"]]
        .count()
    )
    for row in ff.itertuples():
        cluster_name, timestamp = row.Index
        founds.add((cluster_name, timestamp))
        nb_jobs = row.job_id
        if nb_jobs < threshold:
            reports.append((cluster_name, timestamp, nb_jobs))

    # Check cluster usage for cluster-timestamp associations not yet found in dataframe.
    # NB: For these cases, number of jobs is always 0, so they warn iff threshold > 0
    # (the guard below is equivalent to the per-pair test `0 < threshold`).
    if threshold > 0:
        # Timestamps do not depend on the cluster: compute them once.
        timestamps = sorted(tf["timestamp"].unique())
        for cluster_name in cluster_names:
            for timestamp in timestamps:
                if (cluster_name, timestamp) not in founds:
                    reports.append((cluster_name, timestamp, 0))

    # Finally log warnings (no-op when `reports` is empty)
    for cluster_name, timestamp, nb_jobs in reports:
        logger.warning(
            f"[{cluster_name}][{timestamp}] "
            f"insufficient cluster scraping: {nb_jobs} jobs / cluster / time unit; "
            f"minimum required: {threshold} ({avg} - 2 * {stddev}); time unit: {time_unit}"
        )
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| import functools | ||
|
|
||
| import pytest | ||
|
|
||
| from sarc.alerts.usage_alerts.cluster_scraping import check_nb_jobs_per_cluster_per_time | ||
|
|
||
| from ..jobs.test_func_load_job_series import MOCK_TIME | ||
| from .common import _get_warnings | ||
|
|
||
# Shortcut used by the tests below: extract from captured log text the warning
# lines emitted by the cluster_scraping module.
# NOTE(review): the "module" value appears to follow a "<dotted.module>:<filename>"
# format expected by _get_warnings — confirm against .common._get_warnings.
get_warnings = functools.partial(
    _get_warnings,
    module="sarc.alerts.usage_alerts.cluster_scraping:cluster_scraping.py",
)
|
|
||
|
|
||
@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
    "params,expected",
    [
        # Check with default params. In last 7 days from now (mock time: 2023-11-22),
        # there is only 2 jobs from 1 cluster in 1 timestamp. So, threshold will be 0
        # and no warning will be printed.
        (dict(), []),
        # Check with no time interval (i.e. all jobs).
        # Only cluster-timestamp with 0 jobs will produce warnings.
        (
            dict(time_interval=None),
            [
                "[fromage][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[fromage][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[patate][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
                "[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
            ],
        ),
        # Check with a supplementary cluster `another_cluster` which is not in data frame.
        # As there are 1 more clusters, stats (average, std-dev, threshold) will be slightly different,
        # and warnings will also include all cluster-timestamp cases for supplementary cluster.
        (
            dict(
                time_interval=None,
                cluster_names=[
                    "fromage",
                    "mila",
                    "patate",
                    "raisin",
                    "another_cluster",
                ],
            ),
            [
                "[fromage][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[fromage][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[patate][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                # warnings for another_cluster
                "[another_cluster][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
            ],
        ),
        # Check above case with 2 clusters ignored.
        # Stats will be a little different again.
        (
            dict(
                time_interval=None,
                cluster_names=[
                    "mila",
                    "raisin",
                    "another_cluster",
                ],
            ),
            [
                "[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
                "[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
            ],
        ),
    ],
)
def test_check_nb_jobs_per_cluster_per_time(params, expected, caplog):
    """Run the scraping check with parametrized kwargs and compare the exact
    warning lines captured by caplog against the expected list (ordering
    included: existing clusters first, then clusters absent from the data).
    """
    check_nb_jobs_per_cluster_per_time(**params)
    assert get_warnings(caplog.text) == expected
Uh oh!
There was an error while loading. Please reload this page.