Skip to content

Commit 61d8c17

Browse files
authored
Merge pull request #25 from GDC-ConsumerEdge/check_http_endpoints
Add a health check for HTTP endpoints and emit metrics from health checks
2 parents 570276b + c91e82f commit 61d8c17

19 files changed

+424
-28
lines changed

README.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ requiring an in-cluster component.
1212

1313
## Installation
1414

15-
This project uses a CRD and operator, and requires Cluster-Level access. The project can be deployed as a `RootSync` config-sync object with the following configuration. NOTE: Production use should clone the repo, make it private and use the `token` appraoch to authentiate to private repo.
15+
This project uses a CRD and operator, and requires Cluster-Level access. The project can be deployed as a `RootSync` config-sync object with the following configuration. NOTE: Production use should clone the repo, make it private and use the `token` approach to authenticate to private repo.
1616

1717
```yaml
1818
# root-sync.yaml
@@ -68,6 +68,15 @@ data:
6868
parameters:
6969
namespace: vm-workloads
7070
count: 4
71+
- name: HTTP Endpoints
72+
module: CheckHttpEndpoints
73+
parameters:
74+
endpoints:
75+
- name: Google
76+
url: https://www.google.com
77+
- name: Kubernetes API
78+
url: https://kubernetes.default.svc
79+
timeout: 5
7180
```
7281

7382
Below details the health check modules available as part of the solution, with some requiring parameters:
@@ -80,7 +89,8 @@ Below details the health check modules available as part of the solution, with s
8089
| CheckRootSyncs | Checks that RootSyncs are synced and have completed reconciling | |
8190
| CheckVMRuntime | Checks that VMruntime is Ready, without any preflight failure | |
8291
| CheckVirtualMachines | Checks that the expected # of VMs are in a Running State | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of VMs |
83-
| CheckDataVolumes | Checks that the expected # of Data Volumes are 100% imported and ready | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of DVs |
92+
| CheckDataVolumes | Checks that the expected # of Data Volumes are 100% imported and ready | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of DVs |
93+
| CheckHttpEndpoints | Checks that a list of HTTP endpoints are reachable and return a successful status code | **endpoints**: A list of HTTP endpoints to check. Each endpoint has the following parameters: <ul><li> **name**: The name of the endpoint </li><li> **url**: The URL of the endpoint </li><li> **timeout**: (Optional) The timeout in seconds for the request </li><li> **method**: (Optional) The HTTP method to use (e.g. 'GET', 'POST') </li></ul> |
8494

8595

8696
## Building the image

app/app.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from check_virtual_machines import CheckVirtualMachines
1414
from check_virtual_machine_disks import CheckVirtualMachineDisks
1515
from check_vmruntime import CheckVMRuntime
16+
from check_http_endpoints import CheckHttpEndpoints
1617
from config import read_config
1718
from flask import Flask, abort
1819
from health_checks import HealthCheck
@@ -27,7 +28,7 @@
2728
platform_health_metric = Gauge("platform_health", "Platform Checks")
2829
workload_health_metric = Gauge("workload_health", "Workload Checks")
2930

30-
_MAX_WORKERS = os.environ.get("MAX_WORKERS", 10)
31+
_MAX_WORKERS = int(os.environ.get("MAX_WORKERS", 10))
3132
_ROBIN_MASTER_SVC_ENDPOINT = "robin-master.robinio.svc.cluster.local"
3233
_ROBIN_MASTER_SVC_METRICS_PORT = 29446
3334

@@ -39,7 +40,8 @@
3940
CheckVMRuntime.__name__: CheckVMRuntime,
4041
CheckDataVolumes.__name__: CheckDataVolumes,
4142
CheckVirtualMachines.__name__: CheckVirtualMachines,
42-
CheckVirtualMachineDisks.__name__: CheckVirtualMachineDisks
43+
CheckVirtualMachineDisks.__name__: CheckVirtualMachineDisks,
44+
CheckHttpEndpoints.__name__: CheckHttpEndpoints
4345
}
4446

4547

app/check_data_volumes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,21 @@
22

33
from kubernetes import client
44
from pydantic import BaseModel
5+
from prometheus_client import Counter
56

67
log = logging.getLogger("check.datavolumes")
78

9+
DATA_VOLUME_SUCCESS_TOTAL = Counter(
10+
'data_volume_success_total',
11+
'Total number of successful data volume checks',
12+
['namespace']
13+
)
14+
DATA_VOLUME_FAILURE_TOTAL = Counter(
15+
'data_volume_failure_total',
16+
'Total number of failed data volume checks',
17+
['namespace']
18+
)
19+
820

921
class CheckDataVolumesParameters(BaseModel):
1022
namespace: str
@@ -30,6 +42,7 @@ def is_healthy(self):
3042
log.error(
3143
f'Found {len(resp.get("items"))} datavolumes but expected {self.count}.'
3244
)
45+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
3346
return False
3447

3548
# Assert that each data volume is 100% imported and ready
@@ -38,13 +51,16 @@ def is_healthy(self):
3851
log.error(
3952
f'DataVolume {data_volume.get("metadata").get("name")} phase not succeeded'
4053
)
54+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
4155
return False
4256

4357
if data_volume.get("status").get("progress") != "100.0%":
4458
log.error(
4559
f'DataVolume {data_volume.get("metadata").get("name")} not imported'
4660
)
61+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
4762
return False
4863

4964
log.info("Check data volumes passed")
65+
DATA_VOLUME_SUCCESS_TOTAL.labels(namespace=self.namespace).inc()
5066
return True

app/check_google_group_rbac.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
import logging
22

33
from kubernetes import client
4+
from prometheus_client import Counter
45

56
log = logging.getLogger("check.googlegrouprbac")
67

8+
GOOGLE_GROUP_RBAC_SUCCESS_TOTAL = Counter(
9+
'google_group_rbac_success_total',
10+
'Total number of successful Google Group RBAC checks'
11+
)
12+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL = Counter(
13+
'google_group_rbac_failure_total',
14+
'Total number of failed Google Group RBAC checks'
15+
)
16+
717

818
class CheckGoogleGroupRBAC:
919
def is_healthy(self):
@@ -22,15 +32,18 @@ def is_healthy(self):
2232
log.error(
2333
"Did not find expected default.kube-public clientconfig object"
2434
)
35+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL.inc()
2536
return False
2637

2738
for auth in clientconfig.get("spec").get("authentication"):
2839
if auth.get("name") == "google-authentication-method":
2940
log.info("Check Google Group RBAC passed")
41+
GOOGLE_GROUP_RBAC_SUCCESS_TOTAL.inc()
3042
return True
3143

3244
except Exception as err:
3345
log.error("An error occurred parsing the clientconfig %s", err)
3446

3547
log.info("Check GoogleGroupRBAC failed")
48+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL.inc()
3649
return False

app/check_http_endpoints.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
2+
import logging
3+
import time
4+
import requests
5+
import concurrent.futures
6+
from pydantic import BaseModel, Field
7+
from typing import List
8+
from prometheus_client import Counter, Histogram
9+
import socket
10+
from urllib.parse import urlparse
11+
12+
# Module logger for the HTTP endpoint health check.
log = logging.getLogger('check.http_endpoints')

# Prometheus counters/histogram emitted by the endpoint checks, labelled by
# the endpoint's configured name (and outcome for latency).
HTTP_ENDPOINT_SUCCESS_TOTAL = Counter(
    'http_endpoint_success_total',
    'Total number of successful HTTP endpoint checks',
    ['endpoint_name'],
)
HTTP_ENDPOINT_FAILURE_TOTAL = Counter(
    'http_endpoint_failure_total',
    'Total number of failed HTTP endpoint checks',
    ['endpoint_name'],
)
HTTP_ENDPOINT_LATENCY_SECONDS = Histogram(
    'http_endpoint_latency_seconds',
    'Latency of HTTP endpoint checks in seconds',
    ['endpoint_name', 'status'],
)
29+
30+
class Endpoint(BaseModel):
    """A single HTTP endpoint to health-check."""

    name: str            # Human-readable identifier; used as the metric label.
    url: str             # Full URL to request.
    timeout: int = 10    # Per-request timeout, in seconds.
    method: str = 'GET'  # HTTP method to use for the request.
35+
36+
class HttpEndpointsParameters(BaseModel):
    """Validated parameters for CheckHttpEndpoints: a non-empty endpoint list."""

    # NOTE(review): `min_items` is the pydantic v1 spelling; pydantic v2 renamed
    # it to `min_length` — confirm which major version the project pins.
    endpoints: List[Endpoint] = Field(..., min_items=1)
38+
39+
class CheckHttpEndpoints:
    """Health check that verifies a list of HTTP endpoints is reachable.

    Endpoints are requested concurrently; the check passes only when every
    endpoint responds with a successful status code within its timeout.
    Success/failure counters and a latency histogram are emitted per endpoint.
    """

    def __init__(self, parameters: dict):
        # Raises a pydantic ValidationError on a malformed or empty endpoint list.
        self.params = HttpEndpointsParameters(**parameters)

    def is_healthy(self) -> bool:
        """Return True when every configured endpoint check succeeds."""
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # A list suffices here: we only need the results, not a mapping
            # back to the endpoint objects.
            futures = [
                executor.submit(self.check_endpoint, endpoint)
                for endpoint in self.params.endpoints
            ]
            # as_completed lets us fail fast on the first unhealthy endpoint;
            # the with-block still waits for the remaining futures on exit.
            for future in concurrent.futures.as_completed(futures):
                if not future.result():
                    return False
        log.info("Check http endpoints passed")
        return True

    def check_endpoint(self, endpoint: Endpoint) -> bool:
        """Check one endpoint; record metrics and return True on success."""
        # Pre-resolve DNS to warm up the cache for timing purposes.
        # Bind hostname before the try so the except clause can log it safely
        # even if urlparse itself raises (previously a potential NameError).
        hostname = None
        try:
            parsed_url = urlparse(endpoint.url)
            hostname = parsed_url.hostname
            port = parsed_url.port or {'http': 80, 'https': 443}.get(parsed_url.scheme, 80)
            if hostname:
                socket.getaddrinfo(hostname, port)
        except (socket.gaierror, TypeError) as e:
            # Log the pre-resolution failure, but proceed. The actual request
            # will handle the error.
            log.warning(f"DNS pre-resolution failed for {hostname}: {e}")

        # monotonic() is immune to wall-clock adjustments, unlike time.time(),
        # so the failure-path latency observation cannot go negative or jump.
        start_time = time.monotonic()
        try:
            response = requests.request(endpoint.method, endpoint.url, timeout=endpoint.timeout)
            if not response.ok:
                log.error(f"HTTP endpoint {endpoint.name} ({endpoint.url}) returned status code {response.status_code}")
                HTTP_ENDPOINT_FAILURE_TOTAL.labels(endpoint_name=endpoint.name).inc()
                HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='failure').observe(response.elapsed.total_seconds())
                return False
        except requests.exceptions.RequestException as e:
            log.error(f"Failed to connect to HTTP endpoint {endpoint.name} ({endpoint.url}): {e}")
            HTTP_ENDPOINT_FAILURE_TOTAL.labels(endpoint_name=endpoint.name).inc()
            HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='failure').observe(time.monotonic() - start_time)
            return False

        HTTP_ENDPOINT_SUCCESS_TOTAL.labels(endpoint_name=endpoint.name).inc()
        HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='success').observe(response.elapsed.total_seconds())
        return True

app/check_nodes.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
from kubernetes import client
22
import logging
3+
from prometheus_client import Counter
34

45
log = logging.getLogger('check.nodes')
56

7+
NODE_HEALTH_SUCCESS_TOTAL = Counter(
8+
'node_health_success_total',
9+
'Total number of successful node health checks'
10+
)
11+
NODE_HEALTH_FAILURE_TOTAL = Counter(
12+
'node_health_failure_total',
13+
'Total number of failed node health checks'
14+
)
15+
616
class CheckNodes:
717
def is_healthy(self):
818
k8s = client.CoreV1Api()
@@ -16,7 +26,9 @@ def is_healthy(self):
1626

1727
if (not nodeReady):
1828
log.error(f"Node {node.metadata.name} is not ready.")
29+
NODE_HEALTH_FAILURE_TOTAL.inc()
1930
return False
2031

2132
log.info("Check nodes passed")
33+
NODE_HEALTH_SUCCESS_TOTAL.inc()
2234
return True

app/check_robin_cluster.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
import logging
22

33
from kubernetes import client
4+
from prometheus_client import Counter
45

56
log = logging.getLogger("check.robincluster")
67

8+
ROBIN_CLUSTER_SUCCESS_TOTAL = Counter(
9+
'robin_cluster_success_total',
10+
'Total number of successful Robin cluster checks'
11+
)
12+
ROBIN_CLUSTER_FAILURE_TOTAL = Counter(
13+
'robin_cluster_failure_total',
14+
'Total number of failed Robin cluster checks'
15+
)
16+
717

818
class CheckRobinCluster:
919
def is_healthy(self):
@@ -14,26 +24,31 @@ def is_healthy(self):
1424

1525
if len(resp.get("items")) != 1:
1626
log.error(f'Found {len(resp.get("items"))} robinclusters but wanted 1.')
27+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
1728
return False
1829

1930
# Assert that the overall robincluster status is Ready
2031
robin_cluster = resp.get("items")[0]
2132

2233
if robin_cluster.get("status").get("phase") != "Ready":
2334
log.error("Robin cluster not ready.")
35+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
2436
return False
2537

2638
robin_nodes = robin_cluster.get("status").get("robin_node_status")
2739

2840
if len(robin_nodes) != 3:
2941
log.error(f"Found {len(robin_nodes)} robin nodes but wanted 3.")
42+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
3043
return False
3144

3245
# Assert that robin_node_status contains 3 nodes and are all ONLINE and Ready
3346
for node in robin_nodes:
3447
if node.get("state") != "ONLINE" or node.get("status") != "Ready":
3548
log.error(f"Robin node ({node.host_name}) not online or ready.")
49+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
3650
return False
3751

3852
log.info("Check robin cluster passed")
53+
ROBIN_CLUSTER_SUCCESS_TOTAL.inc()
3954
return True

app/check_root_syncs.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@
22
import pprint
33

44
from kubernetes import client
5+
from prometheus_client import Counter
56

67
log = logging.getLogger("check.rootsyncs")
78

9+
ROOT_SYNCS_SUCCESS_TOTAL = Counter(
10+
'root_syncs_success_total',
11+
'Total number of successful root syncs checks'
12+
)
13+
ROOT_SYNCS_FAILURE_TOTAL = Counter(
14+
'root_syncs_failure_total',
15+
'Total number of failed root syncs checks'
16+
)
17+
818

919
class CheckRootSyncs:
1020
def is_healthy(self):
@@ -21,6 +31,7 @@ def is_healthy(self):
2131
log.error(
2232
f'Found {len(resp.get("items"))} rootsyncs but expected 1 or more.'
2333
)
34+
ROOT_SYNCS_FAILURE_TOTAL.inc()
2435
return False
2536

2637
# Assert that each root sync is synced and completed reconciling
@@ -33,6 +44,7 @@ def is_healthy(self):
3344
][0]
3445
if reconciling_condition.get("status") != "False":
3546
log.error(f'RootSync {root_sync.get("name")} is still reconciling')
47+
ROOT_SYNCS_FAILURE_TOTAL.inc()
3648
return False
3749

3850
syncing_condition = [
@@ -47,7 +59,9 @@ def is_healthy(self):
4759
log.error(
4860
f'RootSync {root_sync.get("metadata").get("name")} syncing not complete'
4961
)
62+
ROOT_SYNCS_FAILURE_TOTAL.inc()
5063
return False
5164

5265
log.info("Check root syncs passed")
66+
ROOT_SYNCS_SUCCESS_TOTAL.inc()
5367
return True

0 commit comments

Comments
 (0)