Skip to content

Commit 61d8c17

Browse files
authored
Merge pull request #25 from GDC-ConsumerEdge/check_http_endpoints
Add a health check for HTTP endpoints and emit metrics from health checks
2 parents 570276b + c91e82f commit 61d8c17

19 files changed

+424
-28
lines changed

README.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ requiring an in-cluster component.
1212

1313
## Installation
1414

15-
This project uses a CRD and operator, and requires Cluster-Level access. The project can be deployed as a `RootSync` config-sync object with the following configuration. NOTE: Production use should clone the repo, make it private and use the `token` appraoch to authentiate to private repo.
15+
This project uses a CRD and operator, and requires Cluster-Level access. The project can be deployed as a `RootSync` config-sync object with the following configuration. NOTE: Production use should clone the repo, make it private and use the `token` approach to authenticate to private repo.
1616

1717
```yaml
1818
# root-sync.yaml
@@ -68,6 +68,15 @@ data:
6868
parameters:
6969
namespace: vm-workloads
7070
count: 4
71+
- name: HTTP Endpoints
72+
module: CheckHttpEndpoints
73+
parameters:
74+
endpoints:
75+
- name: Google
76+
url: https://www.google.com
77+
- name: Kubernetes API
78+
url: https://kubernetes.default.svc
79+
timeout: 5
7180
```
7281

7382
Below details the health check modules available as part of the solution, with some requiring parameters:
@@ -80,7 +89,8 @@ Below details the health check modules available as part of the solution, with s
8089
| CheckRootSyncs | Checks that RootSyncs are synced and have completed reconciling | |
8190
| CheckVMRuntime | Checks that VMruntime is Ready, without any preflight failure | |
8291
| CheckVirtualMachines | Checks that the expected # of VMs are in a Running State | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of VMs |
83-
| CheckDataVolumes | Checks that the expected # of Data Volumes are 100% imported and ready | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of DVs |
92+
| CheckDataVolumes | Checks that the expected # of Data Volumes are 100% imported and ready | **namespace**: namespace to run check against <br > **count**: (Optional) expected # of DVs |
93+
| CheckHttpEndpoints | Checks that a list of HTTP endpoints are reachable and return a successful status code | **endpoints**: A list of HTTP endpoints to check. Each endpoint has the following parameters: <ul><li> **name**: The name of the endpoint </li><li> **url**: The URL of the endpoint </li><li> **timeout**: (Optional) The timeout in seconds for the request </li><li> **method**: (Optional) The HTTP method to use (e.g. 'GET', 'POST') </li></ul> |
8494

8595

8696
## Building the image

app/app.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from check_virtual_machines import CheckVirtualMachines
1414
from check_virtual_machine_disks import CheckVirtualMachineDisks
1515
from check_vmruntime import CheckVMRuntime
16+
from check_http_endpoints import CheckHttpEndpoints
1617
from config import read_config
1718
from flask import Flask, abort
1819
from health_checks import HealthCheck
@@ -27,7 +28,7 @@
2728
platform_health_metric = Gauge("platform_health", "Platform Checks")
2829
workload_health_metric = Gauge("workload_health", "Workload Checks")
2930

30-
_MAX_WORKERS = os.environ.get("MAX_WORKERS", 10)
31+
_MAX_WORKERS = int(os.environ.get("MAX_WORKERS", 10))
3132
_ROBIN_MASTER_SVC_ENDPOINT = "robin-master.robinio.svc.cluster.local"
3233
_ROBIN_MASTER_SVC_METRICS_PORT = 29446
3334

@@ -39,7 +40,8 @@
3940
CheckVMRuntime.__name__: CheckVMRuntime,
4041
CheckDataVolumes.__name__: CheckDataVolumes,
4142
CheckVirtualMachines.__name__: CheckVirtualMachines,
42-
CheckVirtualMachineDisks.__name__: CheckVirtualMachineDisks
43+
CheckVirtualMachineDisks.__name__: CheckVirtualMachineDisks,
44+
CheckHttpEndpoints.__name__: CheckHttpEndpoints
4345
}
4446

4547

app/check_data_volumes.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,21 @@
22

33
from kubernetes import client
44
from pydantic import BaseModel
5+
from prometheus_client import Counter
56

67
log = logging.getLogger("check.datavolumes")
78

9+
DATA_VOLUME_SUCCESS_TOTAL = Counter(
10+
'data_volume_success_total',
11+
'Total number of successful data volume checks',
12+
['namespace']
13+
)
14+
DATA_VOLUME_FAILURE_TOTAL = Counter(
15+
'data_volume_failure_total',
16+
'Total number of failed data volume checks',
17+
['namespace']
18+
)
19+
820

921
class CheckDataVolumesParameters(BaseModel):
1022
namespace: str
@@ -30,6 +42,7 @@ def is_healthy(self):
3042
log.error(
3143
f'Found {len(resp.get("items"))} datavolumes but expected {self.count}.'
3244
)
45+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
3346
return False
3447

3548
# Assert that each data volume is 100% imported and ready
@@ -38,13 +51,16 @@ def is_healthy(self):
3851
log.error(
3952
f'DataVolume {data_volume.get("metadata").get("name")} phase not succeeded'
4053
)
54+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
4155
return False
4256

4357
if data_volume.get("status").get("progress") != "100.0%":
4458
log.error(
4559
f'DataVolume {data_volume.get("metadata").get("name")} not imported'
4660
)
61+
DATA_VOLUME_FAILURE_TOTAL.labels(namespace=self.namespace).inc()
4762
return False
4863

4964
log.info("Check data volumes passed")
65+
DATA_VOLUME_SUCCESS_TOTAL.labels(namespace=self.namespace).inc()
5066
return True

app/check_google_group_rbac.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
import logging
22

33
from kubernetes import client
4+
from prometheus_client import Counter
45

56
log = logging.getLogger("check.googlegrouprbac")
67

8+
GOOGLE_GROUP_RBAC_SUCCESS_TOTAL = Counter(
9+
'google_group_rbac_success_total',
10+
'Total number of successful Google Group RBAC checks'
11+
)
12+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL = Counter(
13+
'google_group_rbac_failure_total',
14+
'Total number of failed Google Group RBAC checks'
15+
)
16+
717

818
class CheckGoogleGroupRBAC:
919
def is_healthy(self):
@@ -22,15 +32,18 @@ def is_healthy(self):
2232
log.error(
2333
"Did not find expected default.kube-public clientconfig object"
2434
)
35+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL.inc()
2536
return False
2637

2738
for auth in clientconfig.get("spec").get("authentication"):
2839
if auth.get("name") == "google-authentication-method":
2940
log.info("Check Google Group RBAC passed")
41+
GOOGLE_GROUP_RBAC_SUCCESS_TOTAL.inc()
3042
return True
3143

3244
except Exception as err:
3345
log.error("An error occurred parsing the clientconfig %s", err)
3446

3547
log.info("Check GoogleGroupRBAC failed")
48+
GOOGLE_GROUP_RBAC_FAILURE_TOTAL.inc()
3649
return False

app/check_http_endpoints.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
2+
import logging
3+
import time
4+
import requests
5+
import concurrent.futures
6+
from pydantic import BaseModel, Field
7+
from typing import List
8+
from prometheus_client import Counter, Histogram
9+
import socket
10+
from urllib.parse import urlparse
11+
12+
# Module logger for the HTTP endpoint health check.
log = logging.getLogger('check.http_endpoints')

# Prometheus counters/histogram emitted by the endpoint checks, labelled by
# the endpoint's configured name (and outcome for latency).
HTTP_ENDPOINT_SUCCESS_TOTAL = Counter(
    'http_endpoint_success_total',
    'Total number of successful HTTP endpoint checks',
    ['endpoint_name'],
)
HTTP_ENDPOINT_FAILURE_TOTAL = Counter(
    'http_endpoint_failure_total',
    'Total number of failed HTTP endpoint checks',
    ['endpoint_name'],
)
HTTP_ENDPOINT_LATENCY_SECONDS = Histogram(
    'http_endpoint_latency_seconds',
    'Latency of HTTP endpoint checks in seconds',
    ['endpoint_name', 'status'],
)
29+
30+
class Endpoint(BaseModel):
    """A single HTTP endpoint to health-check."""

    name: str            # Human-readable identifier; used as the metric label.
    url: str             # Full URL to request.
    timeout: int = 10    # Per-request timeout, in seconds.
    method: str = 'GET'  # HTTP method to use for the request.
35+
36+
class HttpEndpointsParameters(BaseModel):
    """Validated parameters for CheckHttpEndpoints: a non-empty endpoint list."""

    # NOTE(review): `min_items` is the pydantic v1 spelling; pydantic v2 renamed
    # it to `min_length` — confirm which major version the project pins.
    endpoints: List[Endpoint] = Field(..., min_items=1)
38+
39+
class CheckHttpEndpoints:
    """Health check that verifies a list of HTTP endpoints is reachable.

    Endpoints are requested concurrently; the check passes only when every
    endpoint responds with a successful status code within its timeout.
    Success/failure counters and a latency histogram are emitted per endpoint.
    """

    def __init__(self, parameters: dict):
        # Raises a pydantic ValidationError on a malformed or empty endpoint list.
        self.params = HttpEndpointsParameters(**parameters)

    def is_healthy(self) -> bool:
        """Return True when every configured endpoint check succeeds."""
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # A list suffices here: we only need the results, not a mapping
            # back to the endpoint objects.
            futures = [
                executor.submit(self.check_endpoint, endpoint)
                for endpoint in self.params.endpoints
            ]
            # as_completed lets us fail fast on the first unhealthy endpoint;
            # the with-block still waits for the remaining futures on exit.
            for future in concurrent.futures.as_completed(futures):
                if not future.result():
                    return False
        log.info("Check http endpoints passed")
        return True

    def check_endpoint(self, endpoint: Endpoint) -> bool:
        """Check one endpoint; record metrics and return True on success."""
        # Pre-resolve DNS to warm up the cache for timing purposes.
        # Bind hostname before the try so the except clause can log it safely
        # even if urlparse itself raises (previously a potential NameError).
        hostname = None
        try:
            parsed_url = urlparse(endpoint.url)
            hostname = parsed_url.hostname
            port = parsed_url.port or {'http': 80, 'https': 443}.get(parsed_url.scheme, 80)
            if hostname:
                socket.getaddrinfo(hostname, port)
        except (socket.gaierror, TypeError) as e:
            # Log the pre-resolution failure, but proceed. The actual request
            # will handle the error.
            log.warning(f"DNS pre-resolution failed for {hostname}: {e}")

        # monotonic() is immune to wall-clock adjustments, unlike time.time(),
        # so the failure-path latency observation cannot go negative or jump.
        start_time = time.monotonic()
        try:
            response = requests.request(endpoint.method, endpoint.url, timeout=endpoint.timeout)
            if not response.ok:
                log.error(f"HTTP endpoint {endpoint.name} ({endpoint.url}) returned status code {response.status_code}")
                HTTP_ENDPOINT_FAILURE_TOTAL.labels(endpoint_name=endpoint.name).inc()
                HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='failure').observe(response.elapsed.total_seconds())
                return False
        except requests.exceptions.RequestException as e:
            log.error(f"Failed to connect to HTTP endpoint {endpoint.name} ({endpoint.url}): {e}")
            HTTP_ENDPOINT_FAILURE_TOTAL.labels(endpoint_name=endpoint.name).inc()
            HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='failure').observe(time.monotonic() - start_time)
            return False

        HTTP_ENDPOINT_SUCCESS_TOTAL.labels(endpoint_name=endpoint.name).inc()
        HTTP_ENDPOINT_LATENCY_SECONDS.labels(endpoint_name=endpoint.name, status='success').observe(response.elapsed.total_seconds())
        return True

app/check_nodes.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
from kubernetes import client
22
import logging
3+
from prometheus_client import Counter
34

45
log = logging.getLogger('check.nodes')
56

7+
NODE_HEALTH_SUCCESS_TOTAL = Counter(
8+
'node_health_success_total',
9+
'Total number of successful node health checks'
10+
)
11+
NODE_HEALTH_FAILURE_TOTAL = Counter(
12+
'node_health_failure_total',
13+
'Total number of failed node health checks'
14+
)
15+
616
class CheckNodes:
717
def is_healthy(self):
818
k8s = client.CoreV1Api()
@@ -16,7 +26,9 @@ def is_healthy(self):
1626

1727
if (not nodeReady):
1828
log.error(f"Node {node.metadata.name} is not ready.")
29+
NODE_HEALTH_FAILURE_TOTAL.inc()
1930
return False
2031

2132
log.info("Check nodes passed")
33+
NODE_HEALTH_SUCCESS_TOTAL.inc()
2234
return True

app/check_robin_cluster.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
11
import logging
22

33
from kubernetes import client
4+
from prometheus_client import Counter
45

56
log = logging.getLogger("check.robincluster")
67

8+
ROBIN_CLUSTER_SUCCESS_TOTAL = Counter(
9+
'robin_cluster_success_total',
10+
'Total number of successful Robin cluster checks'
11+
)
12+
ROBIN_CLUSTER_FAILURE_TOTAL = Counter(
13+
'robin_cluster_failure_total',
14+
'Total number of failed Robin cluster checks'
15+
)
16+
717

818
class CheckRobinCluster:
919
def is_healthy(self):
@@ -14,26 +24,31 @@ def is_healthy(self):
1424

1525
if len(resp.get("items")) != 1:
1626
log.error(f'Found {len(resp.get("items"))} robinclusters but wanted 1.')
27+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
1728
return False
1829

1930
# Assert that the overall robincluster status is Ready
2031
robin_cluster = resp.get("items")[0]
2132

2233
if robin_cluster.get("status").get("phase") != "Ready":
2334
log.error("Robin cluster not ready.")
35+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
2436
return False
2537

2638
robin_nodes = robin_cluster.get("status").get("robin_node_status")
2739

2840
if len(robin_nodes) != 3:
2941
log.error(f"Found {len(robin_nodes)} robin nodes but wanted 3.")
42+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
3043
return False
3144

3245
# Assert that robin_node_status contains 3 nodes and are all ONLINE and Ready
3346
for node in robin_nodes:
3447
if node.get("state") != "ONLINE" or node.get("status") != "Ready":
3548
log.error(f"Robin node ({node.host_name}) not online or ready.")
49+
ROBIN_CLUSTER_FAILURE_TOTAL.inc()
3650
return False
3751

3852
log.info("Check robin cluster passed")
53+
ROBIN_CLUSTER_SUCCESS_TOTAL.inc()
3954
return True

app/check_root_syncs.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@
22
import pprint
33

44
from kubernetes import client
5+
from prometheus_client import Counter
56

67
log = logging.getLogger("check.rootsyncs")
78

9+
ROOT_SYNCS_SUCCESS_TOTAL = Counter(
10+
'root_syncs_success_total',
11+
'Total number of successful root syncs checks'
12+
)
13+
ROOT_SYNCS_FAILURE_TOTAL = Counter(
14+
'root_syncs_failure_total',
15+
'Total number of failed root syncs checks'
16+
)
17+
818

919
class CheckRootSyncs:
1020
def is_healthy(self):
@@ -21,6 +31,7 @@ def is_healthy(self):
2131
log.error(
2232
f'Found {len(resp.get("items"))} rootsyncs but expected 1 or more.'
2333
)
34+
ROOT_SYNCS_FAILURE_TOTAL.inc()
2435
return False
2536

2637
# Assert that each root sync is synced and completed reconciling
@@ -33,6 +44,7 @@ def is_healthy(self):
3344
][0]
3445
if reconciling_condition.get("status") != "False":
3546
log.error(f'RootSync {root_sync.get("name")} is still reconciling')
47+
ROOT_SYNCS_FAILURE_TOTAL.inc()
3648
return False
3749

3850
syncing_condition = [
@@ -47,7 +59,9 @@ def is_healthy(self):
4759
log.error(
4860
f'RootSync {root_sync.get("metadata").get("name")} syncing not complete'
4961
)
62+
ROOT_SYNCS_FAILURE_TOTAL.inc()
5063
return False
5164

5265
log.info("Check root syncs passed")
66+
ROOT_SYNCS_SUCCESS_TOTAL.inc()
5367
return True

0 commit comments

Comments
 (0)