Draft
Changes from all commits (82 commits)
04d4970
Add loki settings
nurbal Oct 11, 2024
862ef96
config update : loki settings
nurbal Oct 14, 2024
a695a60
add loki logger to CLI main
nurbal Oct 15, 2024
940cf30
wip
nurbal Oct 25, 2024
321a9d3
wip
nurbal Oct 30, 2024
77449e4
remove some print
nurbal Nov 4, 2024
149d213
fix strange syntax ??
nurbal Nov 5, 2024
def75d5
lint
nurbal Nov 5, 2024
88f1dc3
fix test config
nurbal Nov 5, 2024
1a0a280
wip
nurbal Nov 5, 2024
5cc2451
lint
nurbal Nov 5, 2024
f261fad
move OTLP config to `LoggingConfig`
nurbal Nov 12, 2024
7e54cf4
Merge commit '67f000a6888c803f6169bab52b1978b3a264e7c7' into SARC-368…
nurbal Nov 12, 2024
f2480c1
update test `test_tracer_with_multiple_clusters_and_dates_and_prometh…
nurbal Nov 12, 2024
e61bac9
lint
nurbal Nov 12, 2024
d06482d
fix config files
nurbal Nov 14, 2024
1c20277
update
nurbal Nov 18, 2024
6b2e8eb
Merge commit '6cd17ec3689f5941ae59d4563858b546397543e8' into SARC-368…
nurbal Nov 19, 2024
af92b35
Merge commit '85bd044a1315e48acece43b9007379f4a5aac582' into SARC-368…
nurbal Nov 27, 2024
b0b2be6
update poetry.lock
nurbal Nov 27, 2024
478c896
Merge commit '87ad1b11e95eb157fba66bd108bf5a7d99227cb7' into SARC-368…
nurbal Dec 9, 2024
8c6a480
attempt to add a functionnal test to OTPL / Loki with a HTTPServer
nurbal Jan 9, 2025
c5977e5
update poetry.lock
nurbal Jan 9, 2025
1075a84
Merge branch 'master' into SARC-368-loki-connect
nurbal Jan 9, 2025
3ab2c68
Add test_loki_logging_handler
nurbal Jan 27, 2025
0a6c9a7
Merge branch 'master' into SARC-368-loki-connect
nurbal Jan 27, 2025
f27d6c6
Merge branch 'master' into SARC-368-loki-connect
nurbal Jan 27, 2025
aea577e
some comments on test_loki_logging_handler
nurbal Feb 10, 2025
ea53b3f
fix imports position ni the file
nurbal Feb 10, 2025
f77ee66
take `verbose` command-line parameter into account
nurbal Feb 10, 2025
6b569f8
black
nurbal Feb 10, 2025
38a5c9a
update config file
nurbal Feb 10, 2025
486472f
update sarc-dev.json
nurbal Feb 10, 2025
3b4afe1
black
nurbal Feb 11, 2025
ab430fe
fix logging level priority between command-line and config file
nurbal Feb 13, 2025
b6ca44f
black
nurbal Feb 13, 2025
939dd07
ADD sanity check for jobs from users without @mila.quebec email address
nurbal Feb 13, 2025
aefeb94
black
nurbal Feb 16, 2025
aa3f1f3
fix tests
nurbal Feb 16, 2025
aba7973
fix lint
nurbal Feb 16, 2025
6bb94b5
Merge branch 'fix_account_matching' into deployed
nurbal Feb 16, 2025
d023399
fix some return typings
nurbal Feb 16, 2025
ad08c52
lint
nurbal Feb 16, 2025
9594441
Merge branch 'fix_account_matching' into deployed
nurbal Feb 17, 2025
374f71b
Merge branch 'master' into fix_account_matching
nurbal Feb 21, 2025
cdbe3ed
Merge commit '36a5364defdba77e5b5e4d39dd69957c7e07087b' into deployed
nurbal Mar 27, 2025
13674d1
Use uv, python 3.11 and mongod 8.0.x
abergeron Mar 26, 2025
9124db2
ensure that the version of mongo in podman is somewhat fixed
abergeron Mar 26, 2025
334a730
Better use of uv with tox
abergeron Mar 26, 2025
f74408f
Udpate readme for uv
abergeron Mar 26, 2025
1cac255
Replace the duplicate readme with a symlink
abergeron Mar 26, 2025
8ae3b59
Fix references to poetry in docs and scripts
abergeron Mar 26, 2025
f989bed
hopefully make test env faster to install
abergeron Mar 26, 2025
79b0765
Add mention to install pandoc to the readme
abergeron Mar 27, 2025
2bdafec
update deployment doc to set python version to 3.11
nurbal Apr 7, 2025
3930e0d
Merge commit '2bdafec94fe07fe9375f8574ff5a9af3dad863b1' into deployed
nurbal Apr 7, 2025
4eb0deb
Merge branch 'master' into fix_account_matching
nurbal Apr 8, 2025
8194de7
Merge branch 'master' into fix_account_matching
nurbal Apr 25, 2025
bef7379
add unit test
nurbal Apr 25, 2025
6ccef54
lint
nurbal Apr 25, 2025
724ca94
Revert "lint"
nurbal Apr 25, 2025
292826a
lint
nurbal Apr 25, 2025
d41723f
Merge branch 'master' into fix_account_matching
nurbal Apr 29, 2025
8bf1ca5
Merge branch 'master' into deployed
nurbal May 1, 2025
ba1934e
Merge branch fix_account_matching into deployed
nurbal May 1, 2025
f6a35fc
Merge branch 'master' into fix_account_matching
nurbal May 1, 2025
b123cd0
Merge branch 'fix_account_matching' into deployed
nurbal May 2, 2025
ce17e6d
fix pyproject.toml
nurbal May 5, 2025
15c1d50
Merge commit '49eaa1b749e60a66740bde3519acfef6c6663f8e' into deployed
nurbal May 8, 2025
7af486f
Merge branch 'master' into deployed
nurbal May 15, 2025
84e8124
Update sarc-prod.yaml
nurbal May 15, 2025
ac67a5c
Update sarc-prod.yaml
nurbal May 15, 2025
496db41
replace print by logging messages during account matching...
nurbal May 16, 2025
ff0188e
Merge branch 'account_matching_log_error_messages' into deployed
nurbal May 16, 2025
553296e
parametrise check_cluster_response() with cluster name (optional)
nurbal Dec 17, 2024
1c4898a
added `--once` parameter to `health check` command
nurbal Dec 17, 2024
c7b66e1
fix `check_cluster_response`
nurbal Feb 3, 2025
c503464
add some sample checks functions
nurbal Feb 3, 2025
b22618e
added `--write` optionnal parameter to `sarc health check --once` and…
nurbal Feb 3, 2025
759247e
wip
nurbal Feb 6, 2025
803feab
add UsersInJobsCheck HealthCheck
nurbal Feb 17, 2025
44d8cf0
set default logging level to WARNING (previously: DEBUG)
nurbal Feb 21, 2025
14 changes: 7 additions & 7 deletions config/sarc-prod.yaml
@@ -19,7 +19,7 @@ sarc:
   sshconfig: "~/.ssh/config"
   clusters:
     mila:
-      host: mila
+      host: localhost
       timezone: America/Montreal
       accounts:
       sacct_bin: "/opt/slurm/bin/sacct"
@@ -31,7 +31,7 @@ sarc:
       start_date: '2022-04-01'
       billing_is_gpu: true
     narval:
-      host: narval.computecanada.ca
+      host: robot.narval.alliancecan.ca
       timezone: America/Montreal
       accounts:
         - rrg-bengioy-ad_gpu
@@ -44,12 +44,12 @@ sarc:
       duc_storage_command: duc ls -d /project/.duc_databases/rrg-bengioy-ad.sqlite /project/rrg-bengioy-ad
       diskusage_report_command: diskusage_report --project --all_users
       prometheus_url: https://mila-thanos.calculquebec.ca
-      prometheus_headers_file: ../../SARC_secrets/secrets/drac_prometheus/headers.json
+      prometheus_headers_file: ../SARC_secrets/secrets/drac_prometheus/headers.json
       start_date: '2022-04-01'
       rgu_start_date: '2023-11-28'
       gpu_to_rgu_billing: ../../SARC_secrets/secrets/gpu_to_rgu_billing_narval.json
     beluga:
-      host: beluga.computecanada.ca
+      host: robot.beluga.alliancecan.ca
       timezone: America/Montreal
       accounts:
         - rrg-bengioy-ad_gpu
@@ -62,12 +62,12 @@ sarc:
       duc_storage_command: duc ls -d /project/.duc_databases/rrg-bengioy-ad.sqlite /project/rrg-bengioy-ad
       diskusage_report_command: diskusage_report --project --all_users
       prometheus_url: https://mila-thanos.calculquebec.ca
-      prometheus_headers_file: ../../SARC_secrets/secrets/drac_prometheus/headers.json
+      prometheus_headers_file: ../SARC_secrets/secrets/drac_prometheus/headers.json
       start_date: '2022-04-01'
       rgu_start_date: '2024-04-03'
       gpu_to_rgu_billing: ../../SARC_secrets/secrets/gpu_to_rgu_billing_beluga.json
     graham:
-      host: graham.computecanada.ca
+      host: robot.graham.alliancecan.ca
       timezone: America/Toronto
       accounts:
         - rrg-bengioy-ad_gpu
@@ -84,7 +84,7 @@ sarc:
       rgu_start_date: '2024-04-03'
       gpu_to_rgu_billing: ../../SARC_secrets/secrets/gpu_to_rgu_billing_graham.json
     cedar:
-      host: cedar.computecanada.ca
+      host: robot.cedar.alliancecan.ca
       timezone: America/Vancouver
       accounts:
         - rrg-bengioy-ad_gpu
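The host changes above repoint every DRAC cluster at its robot.* endpoint and the mila cluster at localhost. A minimal sketch for eyeballing the result, assuming PyYAML is installed and the file keeps the layout shown in this diff:

import yaml

# Load the clusters section of the production config and print each SSH host,
# to verify the new robot.* endpoints. The path is relative to the repo root.
with open("config/sarc-prod.yaml") as f:
    cfg = yaml.safe_load(f)

for name, cluster in cfg["sarc"]["clusters"].items():
    print(f"{name}: host={cluster['host']}")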
27 changes: 16 additions & 11 deletions sarc/account_matching/make_matches.py
@@ -334,19 +334,24 @@ def _manual_matching(DLD_data, DD_persons, override_matches_mila_to_cc):
         drac_account_username,
     ) in override_matches_mila_to_cc.items():
         if mila_email_username not in DD_persons:
-            raise ValueError(
-                f'"{mila_email_username}" is not found in the actual sources.'
-                "This was supplied to `override_matches_mila_to_cc` in the `make_matches.py` file, "
+            msg = (
+                f'"{mila_email_username}" is not found in the actual sources.\n'
+                f"This was supplied to `override_matches_mila_to_cc` in the `make_matches.py` file, "
                 f"but there are not such entries in LDAP.\n"
-                "Someone messed up the manual matching by specifying a Mila email username that does not exist."
+                f"Someone messed up the manual matching by specifying a Mila email username that does not exist, or not ANYMORE."
             )
-        # Note that `matching[drac_account_username]` is itself a dict
-        # with user information from CC. It's not just a username string.
-        if drac_account_username in matching:
-            assert isinstance(matching[drac_account_username], dict)
-            DD_persons[mila_email_username][drac_source] = matching[
-                drac_account_username
-            ]
+            # we don't want to raise an error here because it will break the pipeline
+            # we will just log the error and move on
+            logging.error(msg)
+            # raise ValueError(msg)
+        else:
+            # Note that `matching[drac_account_username]` is itself a dict
+            # with user information from CC. It's not just a username string.
+            if drac_account_username in matching:
+                assert isinstance(matching[drac_account_username], dict)
+                DD_persons[mila_email_username][drac_source] = matching[
+                    drac_account_username
+                ]


 def _make_matches_status_report(DLD_data, DD_persons):
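The change above swaps a hard ValueError for a logged error, so a single stale entry in `override_matches_mila_to_cc` no longer aborts the whole matching run. A self-contained sketch of the same log-and-continue pattern; the function and data names here are illustrative, not from the PR:

import logging

def apply_overrides(overrides: dict, persons: dict) -> None:
    # Log unknown keys and keep processing the rest, rather than raising
    # and breaking the pipeline.
    for email, account in overrides.items():
        if email not in persons:
            logging.error("override references unknown user %s; skipping", email)
            continue
        persons[email]["drac_account"] = account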
120 changes: 120 additions & 0 deletions sarc/alerts/checks.py
@@ -0,0 +1,120 @@
+import random
+from dataclasses import dataclass
+from datetime import timedelta
+
+from sarc.alerts.common import CheckResult, HealthCheck
+from sarc.alerts.db_sanity_checks.users_accounts import check_users_in_jobs
+from sarc.alerts.usage_alerts.cluster_response import check_cluster_response
+from sarc.alerts.usage_alerts.cluster_scraping import check_nb_jobs_per_cluster_per_time
+
+
+# this is a simple check that will fail 50% of the time
+# it uses a custom result class to add more context to the result
+@dataclass
+class HelloWorldResult(CheckResult):
+    custom_comment: str = ""
+
+
+@dataclass
+class HelloWorldCheck(HealthCheck):
+    __result_class__ = HelloWorldResult
+
+    def check(self):
+        if random.random() < 0.5:
+            return self.fail(
+                custom_comment="Hello, HealthMonitor World! You were chosen randomly to fail..."
+            )
+        return self.ok(custom_comment="Hello, HealthMonitor World!")
+
+
+# this is a simple check that will fail 50% of the time
+# it uses the statuses dictionary to add more context information to the result
+@dataclass
+class HelloWorld2Check(HealthCheck):
+    example_additionnal_param: str = "default_value"
+
+    def check(self):
+        random_number = random.random()
+        if random_number < 0.5:
+            return self.fail(
+                statuses={
+                    "comment": "Hello, HealthMonitor World! You were chosen randomly to fail...",
+                    "random_number": random_number,
+                    "example_additionnal_param": self.example_additionnal_param,
+                }
+            )
+        return self.ok(
+            statuses={
+                "comment": "Hello, HealthMonitor World!",
+                "random_number": random_number,
+                "example_additionnal_param": self.example_additionnal_param,
+            }
+        )
+
+
+# checks whether the cluster responded in the last `days` days
+@dataclass
+class ClusterResponseCheck(HealthCheck):
+    days: int = 7
+
+    def check(self):
+        cluster_name = self.parameters["cluster_name"]
+        days = self.days
+        # days = 7
+        if check_cluster_response(
+            time_interval=timedelta(days=days), cluster_name=cluster_name
+        ):
+            return self.ok
+        return self.fail(
+            statuses={
+                "comment": f" Cluster {cluster_name} has not been scraped in the last {days} days."
+            }
+        )
+
+
+@dataclass
+class ClusterJobScrapingCheck(HealthCheck):
+    time_interval: int = 7
+    time_unit: int = 1
+    stddev: int = 2
+    verbose: bool = False
+
+    def check(self):
+        time_interval = timedelta(days=self.time_interval)
+        time_unit = timedelta(days=self.time_unit)
+        cluster_name = self.parameters["cluster_name"]
+        nb_stddev = self.stddev
+        verbose = self.verbose
+        if check_nb_jobs_per_cluster_per_time(
+            time_interval=time_interval,
+            time_unit=time_unit,
+            cluster_names=[cluster_name],
+            nb_stddev=nb_stddev,
+            verbose=verbose,
+        ):
+            return self.ok
+        return self.fail(
+            statuses={
+                "comment": f"Cluster {cluster_name} has not enough jobs scrapped",
+                "time_interval": time_interval,
+                "time_unit": time_unit,
+                "stddev": nb_stddev,
+            }
+        )
+
+
+@dataclass
+class UsersInJobsCheck(HealthCheck):
+    time_interval: int = 7  # days
+
+    def check(self):
+        time_interval = timedelta(days=self.time_interval)
+        missing_users = check_users_in_jobs(time_interval=time_interval)
+        if not missing_users:
+            return self.ok
+        return self.fail(
+            statuses={
+                "comment": f"Missing users in jobs: {missing_users}",
+                "time_interval": time_interval,
+            }
+        )
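How these checks are instantiated and scheduled is defined by the HealthCheck base class in sarc.alerts.common, which this PR does not show. A hedged sketch of driving one directly, assuming the base class needs no extra constructor arguments and that CheckResult exposes the `status` field the CLI code below relies on:

from sarc.alerts.checks import HelloWorldCheck

check = HelloWorldCheck()  # constructor arguments are an assumption
result = check.check()  # a HelloWorldResult built by self.ok()/self.fail()
print(result.status, result.custom_comment)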
15 changes: 13 additions & 2 deletions sarc/alerts/usage_alerts/cluster_response.py
@@ -7,7 +7,9 @@
 logger = logging.getLogger(__name__)


-def check_cluster_response(time_interval: timedelta = timedelta(days=7)):
+def check_cluster_response(
+    time_interval: timedelta = timedelta(days=7), cluster_name=None
+):
     """
     Check if we scraped clusters recently.
     Log a warning for each cluster not scraped since `time_interval` from now.
@@ -24,11 +26,18 @@ def check_cluster_response(time_interval: timedelta = timedelta(days=7)):
     # Get the oldest date allowed from now
     oldest_allowed_date = current_date - time_interval
     # Check each available cluster
-    for cluster in get_available_clusters():
+    clusters = (
+        [c for c in get_available_clusters() if c.cluster_name == cluster_name]
+        if cluster_name
+        else get_available_clusters()
+    )
+    result = True
+    for cluster in clusters:
         if cluster.end_date is None:
             logger.warning(
                 f"[{cluster.cluster_name}] no end_date available, cannot check last scraping"
             )
+            result = False
         else:
             # Cluster's latest scraping date should be in `cluster.end_date`.
             # NB: We assume cluster's `end_date` is stored as a date string,
@@ -44,3 +53,5 @@ def check_cluster_response(time_interval: timedelta = timedelta(days=7)):
                 f"oldest required: {oldest_allowed_date}, "
                 f"current time: {current_date}"
             )
+            result = False
+    return result
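With these additions the function doubles as a predicate: it still logs one warning per stale cluster, but now also returns False if any checked cluster is missing an end_date or was scraped too long ago. A short usage sketch, assuming a configured SARC environment:

from datetime import timedelta
from sarc.alerts.usage_alerts.cluster_response import check_cluster_response

all_fresh = check_cluster_response()  # every cluster, default 7-day window
mila_fresh = check_cluster_response(timedelta(days=3), cluster_name="mila")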
5 changes: 5 additions & 0 deletions sarc/alerts/usage_alerts/cluster_scraping.py
@@ -65,6 +65,8 @@ def check_nb_jobs_per_cluster_per_time(
     else:
         cluster_names = sorted(df["cluster_name"].unique())

+    result = True  # by default, everything's ok
+
     # Iter for each cluster.
     for cluster_name in cluster_names:
         # Select only jobs for current cluster,
@@ -127,3 +129,6 @@ def check_nb_jobs_per_cluster_per_time(
                 f"minimum required for this cluster: {threshold} ({avg} - {nb_stddev} * {stddev}); "
                 f"time unit: {time_unit}"
             )
+            result = False
+
+    return result
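The failure criterion logged above is a lower bound of avg - nb_stddev * stddev jobs per time unit. A worked instance of the arithmetic, with made-up numbers:

# A cluster averaging 500 jobs per time unit with a standard deviation of 100
# and nb_stddev=2 fails the check on any unit with fewer than 300 jobs.
avg, stddev, nb_stddev = 500.0, 100.0, 2
threshold = avg - nb_stddev * stddev
assert threshold == 300.0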
24 changes: 11 additions & 13 deletions sarc/cli/health/check.py
@@ -5,7 +5,7 @@

 import gifnoc

-from sarc.alerts.common import CheckStatus
+from sarc.alerts.common import CheckStatus, config
 from sarc.alerts.runner import CheckRunner
 from sarc.config import config
@@ -16,6 +16,7 @@
 class HealthCheckCommand:
     config: Path = None
     once: bool = False
+    write: bool = False

     name: str = None

@@ -24,24 +25,21 @@ def execute(self) -> int:
         with gifnoc.use(self.config):
             if self.name:
                 # only run one check, once (no CheckRunner)
-                check = hcfg.checks[self.name]
-                results = check(write=False)
-                pprint(results)
-                for k, status in results.statuses.items():
-                    print(f"{status.name} -- {k}")
-                print(f"{results.status.name}")
+                check = config.checks[self.name]
+                results = check(write=self.write)
+                if results.status == CheckStatus.OK:
+                    print(f"Check '{check.name}' succeeded.")
+                else:
+                    print(f"Check '{check.name}' failed.")
+                pprint(results)
             elif self.once:
                 # run all checks, once (no CheckRunner)
-                for check in [c for c in hcfg.checks.values() if c.active]:
-                    results = check(write=False)
+                for check in [c for c in config.checks.values() if c.active]:
+                    results = check(write=self.write)
                     if results.status == CheckStatus.OK:
                         print(f"Check '{check.name}' succeeded.")
                     else:
                         print(f"Check '{check.name}' failed.")
                     pprint(results)
-                    for k, status in results.statuses.items():
-                        print(f"{status.name} -- {k}")
-                    print(f"{results.status.name}")
             else:
                 try:
                     runner = CheckRunner(directory=hcfg.directory, checks=hcfg.checks)
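With the new `write` field, both the single-check and --once paths can persist their results instead of always passing write=False. A hedged sketch of driving the command object directly, assuming it is a plain dataclass as the field declarations suggest; the field values are illustrative:

from sarc.cli.health.check import HealthCheckCommand

# name=None and once=True take the `elif self.once` branch above:
cmd = HealthCheckCommand(config=None, once=True, write=True, name=None)
exit_code = cmd.execute()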
2 changes: 1 addition & 1 deletion sarc/logging.py
@@ -68,6 +68,6 @@ def setupLogging(verbose_level: int = 0):
         handlers=[logging.StreamHandler()],
         format="%(asctime)-15s::%(levelname)s::%(name)s::%(message)s",
         level=verbose_levels.get(
-            verbose_level, logging.DEBUG
+            verbose_level, logging.WARNING
         ),  # Default log level, if not specified in config
     )
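The effect of this one-word change: a verbosity level missing from the mapping now falls back to WARNING instead of DEBUG. A small sketch; the real verbose_levels dict is defined earlier in sarc/logging.py, so the one here is an assumption:

import logging

verbose_levels = {1: logging.INFO, 2: logging.DEBUG}  # assumed mapping
# verbose_level=0 is not in the mapping, so the new fallback applies:
assert verbose_levels.get(0, logging.WARNING) == logging.WARNING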
7 changes: 4 additions & 3 deletions sarc/users/supervisor.py
@@ -1,3 +1,4 @@
+import logging
 import re
 from dataclasses import dataclass, field
 from itertools import chain
@@ -136,17 +137,17 @@ def make_list(errors):

         def show_error(msg, array):
             if len(array) > 0:
-                print(f"{msg} {make_list(array)}")
+                logging.error(f"{msg} {make_list(array)}")

         show_error(" Missing supervisors:", self.no_supervisors)
         show_error(" Too many supervisors:", self.too_many_supervisors)
         show_error(" Prof and Student:", self.prof_and_student)

         if self.unknown_supervisors:
-            print(f" Unknown supervisors: {self.unknown_supervisors}")
+            logging.warning(f" Unknown supervisors: {self.unknown_supervisors}")

         if self.unknown_group:
-            print(f" Unknown group: {self.unknown_group}")
+            logging.warning(f" Unknown group: {self.unknown_group}")


 def _extract_supervisors_from_groups(