Merged
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -76,7 +76,8 @@ disable = [
"invalid-name",
"no-else-return", # Bad rule IMO (- OB)
"line-too-long", # Black takes care of line length.
"logging-fstring-interpolation"
"logging-fstring-interpolation",
"duplicate-code",
]
extension-pkg-whitelist = "pydantic"

104 changes: 104 additions & 0 deletions sarc/alerts/usage_alerts/cluster_scraping.py
@@ -0,0 +1,104 @@
import logging
from datetime import datetime, timedelta
from typing import List, Optional

from sarc.config import MTL
from sarc.jobs.series import compute_time_frames, load_job_series

logger = logging.getLogger(__name__)


def check_nb_jobs_per_cluster_per_time(
time_interval: Optional[timedelta] = timedelta(days=7),
    time_unit: timedelta = timedelta(days=1),
cluster_names: Optional[List[str]] = None,
):
"""
Check if we have scraped enough jobs per cluster per time unit on given time interval.
Log a warning for each cluster where number of jobs is lower than a required limit
computed using mean and standard deviation statistics from clusters usage.

Parameters
----------
time_interval: timedelta
If given, only jobs which ran in [now - time_interval, time_interval] will be used for checking.
Default is last 7 days.
If None, all jobs are used.
time_unit: timedelta
Time unit in which we must check cluster usage through time_interval. Default is 1 day.
cluster_names: list
Optional list of clusters to check.
If empty (or not specified), use all clusters available among jobs retrieved with time_interval.
"""

# Parse time_interval
start, end, clip_time = None, None, False
if time_interval is not None:
end = datetime.now(tz=MTL)
start = end - time_interval
clip_time = True

# Get data frame
df = load_job_series(start=start, end=end, clip_time=clip_time)

# Split data frame into time frames using `time_unit`
tf = compute_time_frames(df, frame_size=time_unit)

# List clusters
if not cluster_names:
cluster_names = sorted(df["cluster_name"].unique())

# Generate a dataframe for stats.
f_stats = (
        # Keep only rows associated with the given clusters
tf[tf["cluster_name"].isin(cluster_names)]
# Group by timestamp
.groupby(["timestamp"])[["job_id"]]
# And count jobs by counting column `job_id`
.count()
)
# Compute cluster usage: number of jobs per cluster per timestamp
f_stats["jobs_per_cluster"] = f_stats["job_id"] / len(cluster_names)
Collaborator

Hmmm, so the statistics (mean, standard deviation) are computed over all clusters together, and not cluster by cluster, if I understand correctly? I think what we wanted was statistics for each cluster independently. (But maybe I am misreading the code.)

Contributor Author

Indeed, the statistics are currently computed over all clusters together.

I will update the code to compute them cluster by cluster across the days.
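
For illustration, a minimal sketch of that per-cluster variant (hypothetical, not the code in this PR; it reuses `tf` and `cluster_names` from the surrounding function):

per_cluster_stats = (
    # Count jobs per (cluster, timestamp), then aggregate per cluster.
    tf[tf["cluster_name"].isin(cluster_names)]
    .groupby(["cluster_name", "timestamp"])[["job_id"]]
    .count()
    .groupby("cluster_name")["job_id"]
    .agg(["mean", "std"])
)
# One warning threshold per cluster instead of a global one: max(0, mean - 2 * std)
per_cluster_stats["threshold"] = (
    per_cluster_stats["mean"] - 2 * per_cluster_stats["std"]
).clip(lower=0)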

# Compute average cluster usage
avg = f_stats["jobs_per_cluster"].mean()
# Compute standard deviation for cluster usage
stddev = f_stats["jobs_per_cluster"].std()
    # Compute threshold to use for warnings: max(0, <average> - 2 * <standard deviation>)
threshold = max(0, avg - 2 * stddev)

# List to collect warnings:
reports = []
    # Set of (cluster, timestamp) pairs found while scanning the data frame:
founds = set()

# Check cluster usage from data frame
ff = (
tf[tf["cluster_name"].isin(cluster_names)]
.groupby(["cluster_name", "timestamp"])[["job_id"]]
.count()
)
for row in ff.itertuples():
cluster_name, timestamp = row.Index
founds.add((cluster_name, timestamp))
nb_jobs = row.job_id
if nb_jobs < threshold:
reports.append((cluster_name, timestamp, nb_jobs))

    # Check cluster usage for (cluster, timestamp) pairs not present in the data frame.
    # NB: For these cases, the number of jobs is always 0.
for cluster_name in cluster_names:
        # Iterate over each timestamp available in the data frame
for timestamp in sorted(tf["timestamp"].unique()):
key = (cluster_name, timestamp)
nb_jobs = 0
if key not in founds and nb_jobs < threshold:
reports.append((cluster_name, timestamp, nb_jobs))

# Finally log warnings
if reports:
for cluster_name, timestamp, nb_jobs in reports:
logger.warning(
f"[{cluster_name}][{timestamp}] "
f"insufficient cluster scraping: {nb_jobs} jobs / cluster / time unit; "
f"minimum required: {threshold} ({avg} - 2 * {stddev}); time unit: {time_unit}"
)
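
For context, a minimal usage sketch (assuming a configured SARC database; the parameter values are illustrative):

from datetime import timedelta

from sarc.alerts.usage_alerts.cluster_scraping import check_nb_jobs_per_cluster_per_time

# Warn about any cluster that scraped too few jobs per day over the last 2 weeks.
check_nb_jobs_per_cluster_per_time(
    time_interval=timedelta(days=14),
    time_unit=timedelta(days=1),
    cluster_names=["mila", "raisin"],
)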
128 changes: 128 additions & 0 deletions tests/functional/usage_alerts/test_alert_cluster_scraping.py
@@ -0,0 +1,128 @@
import functools

import pytest

from sarc.alerts.usage_alerts.cluster_scraping import check_nb_jobs_per_cluster_per_time

from ..jobs.test_func_load_job_series import MOCK_TIME
from .common import _get_warnings

get_warnings = functools.partial(
_get_warnings,
module="sarc.alerts.usage_alerts.cluster_scraping:cluster_scraping.py",
)
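
For reference, a plausible sketch of the `_get_warnings` helper used above (hypothetical; the actual implementation lives in tests/functional/usage_alerts/common.py and may differ). It assumes pytest's default caplog line format:

def _get_warnings(text: str, module: str) -> list:
    # Each caplog line looks like: "WARNING  <logger name>:<filename>:<lineno> <message>"
    warnings = []
    for line in text.splitlines():
        parts = line.split(maxsplit=2)
        if len(parts) == 3 and parts[0] == "WARNING" and parts[1].startswith(module):
            warnings.append(parts[2])
    return warnings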


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db", "tzlocal_is_mtl")
@pytest.mark.parametrize(
"params,expected",
[
        # Check with default params. In the last 7 days from now (mock time: 2023-11-22),
        # there are only 2 jobs from 1 cluster in 1 timestamp, so the threshold will be 0
        # and no warning will be logged.
(dict(), []),
        # Check with no time interval (i.e. all jobs).
        # Only (cluster, timestamp) pairs with 0 jobs will produce warnings.
(
dict(time_interval=None),
[
"[fromage][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[fromage][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[patate][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.15327969134614228 (0.90625 - 2 * 0.37648515432692886); time unit: 1 day, 0:00:00",
],
),
        # Check with a supplementary cluster `another_cluster` which is not in the data frame.
        # As there is 1 more cluster, the stats (average, std-dev, threshold) will be slightly different,
        # and warnings will also include all (cluster, timestamp) pairs for the supplementary cluster.
(
dict(
time_interval=None,
cluster_names=[
"fromage",
"mila",
"patate",
"raisin",
"another_cluster",
],
),
[
"[fromage][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[fromage][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[patate][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
# warnings for another_cluster
"[another_cluster][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
"[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.12262375307691387 (0.7250000000000001 - 2 * 0.3011881234615431); time unit: 1 day, 0:00:00",
],
),
        # Check the above case with 2 clusters excluded.
        # Stats will again differ slightly.
(
dict(
time_interval=None,
cluster_names=[
"mila",
"raisin",
"another_cluster",
],
),
[
"[mila][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[mila][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[mila][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[mila][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[mila][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[raisin][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-14 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-15 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-16 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-17 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-18 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-19 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-02-20 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
"[another_cluster][2023-11-21 00:01:00-05:00] insufficient cluster scraping: 0 jobs / cluster / time unit; minimum required: 0.2288400740511256 (1.0833333333333333 - 2 * 0.4272466296411038); time unit: 1 day, 0:00:00",
],
),
],
)
def test_check_nb_jobs_per_cluster_per_time(params, expected, caplog):
check_nb_jobs_per_cluster_per_time(**params)
assert get_warnings(caplog.text) == expected