Skip to content

Commit 54cdd5d

Browse files
authored
Merge pull request #17 from GDC-ConsumerEdge/cli
feat: Add a cli to run cluster health validator locally
2 parents 36dfe6c + dfa166b commit 54cdd5d

File tree

3 files changed

+202
-2
lines changed

3 files changed

+202
-2
lines changed

CODEOWNERS

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
# Root code owner
2-
* @mike-ensor
2+
* @mike-ensor @benfogel

README.md

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,14 @@
22

33
The cluster health validator is a service that runs in-cluster and reports an
44
aggregated signal of platform and workload health. The health is reported both
5-
as a status as a Kubernetes and asa prometheus metric. This can be used during
5+
as a Kubernetes status and as a Prometheus metric. This can be used during
66
cluster provisioning to signal completion of the pre-staging process or as a
77
continual sanity check of the state of a cluster.
88

9+
Alternatively, the cluster health validator can run locally, useful for local
10+
troubleshooting or to use during the cluster provisioning process without
11+
requiring an in-cluster component.
12+
913
## Installation
1014

1115
This project uses a CRD and operator, and requires Cluster-Level access. The project can be deployed as a `RootSync` config-sync object with the following configuration. NOTE: Production use should clone the repo, make it private and use the `token` approach to authenticate to the private repo.
@@ -86,3 +90,47 @@ IMAGE_TAG=gcr.io/${PROJECT_ID}/cluster-health-validator:1.0.0
8690
docker build -t ${IMAGE_TAG} .
8791
docker push ${IMAGE_TAG}
8892
```
93+
94+
## Local Usage
95+
96+
```
97+
python3 -m venv .venv
98+
source .venv/bin/activate
99+
pip install -r app/requirements.txt
100+
101+
python3 app --help
102+
usage: app [-h] [--health-check HEALTH_CHECK [HEALTH_CHECK ...]] [-v | -q] [-w] [-i INTERVAL] [-t TIMEOUT]
103+
104+
options:
105+
-h, --help show this help message and exit
106+
--health-check HEALTH_CHECK [HEALTH_CHECK ...]
107+
Set a health check to perform. For health checks requiring parameters, pass them in a key=value format as additional arguments. Example: --health-check
108+
checkvirtualmachines namespace=vm-workloads count=3
109+
-v, --verbose increase output verbosity; -vv for max verbosity
110+
-q, --quiet output errors only
111+
-w, --wait wait for health checks to pass before exiting
112+
-i INTERVAL, --interval INTERVAL
113+
interval to poll passing health checks
114+
-t TIMEOUT, --timeout TIMEOUT
115+
Overall timeout for health checks to pass
116+
```
117+
118+
Examples:
119+
120+
```
121+
# Run the default health checks (CheckNodes, CheckRootSyncs, CheckRobinCluster)
122+
python3 app
123+
124+
# Run customized health checks
125+
python3 app --health-check checknodes \
126+
--health-check checkrobincluster \
127+
--health-check checkrootsyncs \
128+
--health-check checkgooglegrouprbac \
129+
--health-check checkvirtualmachines namespace=vm-workloads count=3 \
130+
--health-check checkdatavolumes namespace=vm-workloads count=3
131+
132+
# Run default health checks and wait until all health checks pass.
133+
# Timeout after 1 hour if health checks don't pass
134+
python3 app --wait --interval 60 --timeout 3600
135+
136+
```

app/__main__.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
import argparse
2+
import logging
3+
import sys
4+
import time
5+
from check_data_volumes import CheckDataVolumes
6+
from check_google_group_rbac import CheckGoogleGroupRBAC
7+
from check_nodes import CheckNodes
8+
from check_robin_cluster import CheckRobinCluster
9+
from check_root_syncs import CheckRootSyncs
10+
from check_virtual_machines import CheckVirtualMachines
11+
from check_vmruntime import CheckVMRuntime
12+
from kubernetes import config
13+
14+
# Registry of every available health check, keyed by the lower-cased class
# name so a check can be looked up from the name given on the command line.
health_check_map = {
    check_class.__name__.lower(): check_class
    for check_class in (
        CheckGoogleGroupRBAC,
        CheckNodes,
        CheckRobinCluster,
        CheckRootSyncs,
        CheckVMRuntime,
        CheckDataVolumes,
        CheckVirtualMachines,
    )
}

# Names (registry keys) of the checks that run when none are requested.
default_health_checks = [
    check_class.__name__.lower()
    for check_class in (CheckNodes, CheckRobinCluster, CheckRootSyncs)
]

# Module-level setup: configure the Kubernetes client, send log records to
# stdout, and create the logger shared by the functions in this module.
config.load_config()
logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger('main')
33+
34+
def run_health_checks(args):
    """Build the requested health checks, run each one once, and report.

    Args:
        args: Parsed command-line namespace. Only ``args.health_check`` is
            read. ``None`` selects ``default_health_checks``; otherwise it
            is a list with one entry per ``--health-check`` flag, each entry
            being a list of tokens: the check name followed by optional
            ``key=value`` parameters.

    Returns:
        0 when every check reports healthy; 1 when the request is invalid
        or when at least one check is unhealthy or raises.
    """
    if args.health_check is None:
        # use default health checks
        logger.info('No health checks specified, using default health checks: ' + ', '.join(default_health_checks))
        checks = [health_check_map[check_name]() for check_name in default_health_checks]
    else:
        checks = []
        for health_check in args.health_check:
            if not health_check:
                logger.error('No health check specified')
                return 1

            check_name = health_check[0].lower()
            if check_name not in health_check_map:
                # health_check is a list of tokens, so report the name token;
                # concatenating the list itself to a str raises TypeError.
                logger.error('Unknown health check specified: ' + health_check[0])
                return 1

            parameters = health_check[1:]
            if not parameters:
                checks.append(health_check_map[check_name]())
                continue

            # Health check includes named parameters that need to be passed
            check_args = {}
            for parameter in parameters:
                # Split on the first '=' only so values may contain '='.
                key, separator, value = parameter.partition('=')
                if not separator:
                    logger.error('Invalid parameter specified: ' + parameter + '. Parameters must be in the format key=value')
                    return 1
                check_args[key] = value

            checks.append(health_check_map[check_name](check_args))

    failed_health_checks = []
    for check in checks:
        check_class_name = check.__class__.__name__
        try:
            healthy = check.is_healthy()
        except Exception as err:
            # A check that raises counts as failed, but keep the reason
            # visible instead of silently discarding it.
            logger.warning('Health check %s raised %s: %s',
                           check_class_name, type(err).__name__, err)
            logger.debug('Traceback for %s', check_class_name, exc_info=True)
            healthy = False

        if not healthy:
            failed_health_checks.append(check_class_name)

    if failed_health_checks:
        for failure in failed_health_checks:
            logger.error('Health check failed: ' + failure)
        return 1

    logger.info('All health checks passed!')
    return 0
87+
88+
89+
def main() -> int:
    """Parse the command line and run the health checks once or until they pass.

    Without ``--wait`` the checks run a single time. With ``--wait`` they are
    re-run every ``--interval`` seconds until they all pass or ``--timeout``
    seconds have elapsed; the checks always run at least once.

    Returns:
        0 when all health checks pass, 1 when they fail or the wait times
        out. Invalid command-line values exit through argparse (status 2).
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--health-check',
        action='append',
        help='''Set a health check to perform.
        For health checks requiring parameters, pass them in a key=value format as additional arguments.
        Example: --health-check checkvirtualmachines namespace=vm-workloads count=3''',
        nargs='+')
    verbosity_mutex = parser.add_mutually_exclusive_group()
    verbosity_mutex.add_argument(
        '-v', '--verbose',
        action='count',
        default=0,
        help='increase output verbosity; -vv for max verbosity')
    verbosity_mutex.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='output errors only')

    parser.add_argument(
        '-w', '--wait',
        action='store_true',
        help='wait for health checks to pass before exiting')

    parser.add_argument(
        '-i', '--interval',
        type=int,
        default=60,
        help='interval to poll passing health checks')

    parser.add_argument(
        '-t', '--timeout',
        type=int,
        default=3600,
        help='Overall timeout for health checks to pass')

    args = parser.parse_args()
    if args.quiet:
        logger.setLevel(logging.ERROR)
    elif args.verbose == 1:
        logger.setLevel(logging.INFO)
    elif args.verbose >= 2:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.WARNING)

    if not args.wait:
        return run_health_checks(args)

    # The polling values only matter with --wait, so validate them here.
    # A zero interval used to raise ZeroDivisionError and a negative one
    # skipped the checks entirely.
    if args.interval <= 0:
        parser.error('--interval must be a positive number of seconds')
    if args.timeout < 0:
        parser.error('--timeout must not be negative')

    # Poll until all health checks pass or the deadline is reached. Using a
    # monotonic deadline counts the time spent inside the checks against
    # the timeout and guarantees at least one attempt.
    deadline = time.monotonic() + args.timeout
    while True:
        if run_health_checks(args) == 0:
            return 0

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break

        # Never sleep past the deadline; the last attempt runs at the deadline.
        time.sleep(min(args.interval, remaining))

    logger.error('Timed out waiting for health checks to pass')
    return 1
150+
151+
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == '__main__':
    raise SystemExit(main())

0 commit comments

Comments
 (0)