diff --git a/README.md b/README.md index fc3c50b..246587d 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,7 @@ spec: Cluster Health Validator allows customization for which platform and workload health checks are performed. This is specified as part of the ConfigMap as part of the deployment. ``` +--- apiVersion: v1 kind: ConfigMap metadata: @@ -87,11 +88,40 @@ Below details the health check modules available as part of the solution, with s | CheckGoogleGroupRBAC | Checks that Google Group RBAC has been enabled | | | CheckRobinCluster | Checks RobinCluster Health | | | CheckRootSyncs | Checks that RootSyncs are synced and have completed reconciling | | -| CheckVMRuntime | Checks that VMruntime is Ready, without any preflight failure | | +| CheckVMRuntime | Checks that VMRuntime is Ready, without any preflight failure | | | CheckVirtualMachines | Checks that the expected # of VMs are in a Running State | **namespace**: namespace to run check against
**count**: (Optional) expected # of VMs | | CheckDataVolumes | Checks that the expected # of Data Volumes are 100% imported and ready | **namespace**: namespace to run check against
**count**: (Optional) expected # of DVs | | CheckHttpEndpoints | Checks that a list of HTTP endpoints are reachable and return a successful status code | **endpoints**: A list of HTTP endpoints to check. Each endpoint has the following parameters: | +### on_failure property + +Each health check module supports an `on_failure` property that allows you to control the behavior of the health check when it fails. The `on_failure` property can be set to one of two values: + +- `fail` (default): If the health check fails, the entire group of checks (platform or workload) will be considered failed. +- `ignore`: If the health check fails, the failure will be logged and tracked in metrics, but it will not affect the overall health status of the group. + +This is useful for non-critical health checks that you want to monitor without letting them affect the overall health status. + +Example: + +```yaml +platform_checks: +- name: Node Health + module: CheckNodes +- name: Robin Cluster Health + module: CheckRobinCluster + on_failure: ignore + +workload_checks: +- name: VM Workloads Health + module: CheckVirtualMachines + parameters: + namespace: vm-workloads + on_failure: fail +- name: VM Disk Health + module: CheckVirtualMachineDisks + on_failure: ignore +``` ## Building the image diff --git a/app/app.py b/app/app.py index d025a08..9ce9319 100644 --- a/app/app.py +++ b/app/app.py @@ -83,53 +83,59 @@ def create_health_check_cr(): def run_checks(): + app_config = read_config() global health_check_cr if not health_check_cr: health_check_cr = HealthCheck() - platform_checks = [] - workload_checks = [] - - app_config = read_config() - - for check in app_config.platform_checks: - if "parameters" in check: - platform_checks.append( - health_check_map[check["module"]](check["parameters"]) - ) + platform_checks_with_configs = [] + for check_config in app_config.platform_checks: + check_class = health_check_map[check_config["module"]] + if "parameters" in check_config: + instance = 
check_class(check_config["parameters"]) else: - platform_checks.append(health_check_map[check["module"]]()) - - for check in app_config.workload_checks: - if "parameters" in check: - workload_checks.append( - health_check_map[check["module"]](check["parameters"]) - ) + instance = check_class() + platform_checks_with_configs.append((instance, check_config)) + + workload_checks_with_configs = [] + for check_config in app_config.workload_checks: + check_class = health_check_map[check_config["module"]] + if "parameters" in check_config: + instance = check_class(check_config["parameters"]) else: - workload_checks.append(health_check_map[check["module"]]()) + instance = check_class() + workload_checks_with_configs.append((instance, check_config)) with concurrent.futures.ThreadPoolExecutor(max_workers=_MAX_WORKERS) as executor: platform_checks_futures = { - executor.submit(check.is_healthy): check.__class__.__name__ - for check in platform_checks + executor.submit(check.is_healthy): config + for check, config in platform_checks_with_configs } workload_checks_futures = { - executor.submit(check.is_healthy): check.__class__.__name__ - for check in workload_checks + executor.submit(check.is_healthy): config + for check, config in workload_checks_with_configs } def wait_on_futures(futures): checks_failed = [] for future in concurrent.futures.as_completed(futures): - name = futures[future] + config = futures[future] + name = config["name"] + on_failure = config.get("on_failure", "fail") try: if not future.result(): - checks_failed.append(name) + if on_failure == "fail": + checks_failed.append(name) + else: + logging.info(f"Check '{name}' failed but is set to be ignored.") # Handling k8s resource not found here as it is not # handled in the individual checks. 
except ApiException as e: if e.status == 404: - checks_failed.append(name) + if on_failure == "fail": + checks_failed.append(name) + else: + logging.info(f"Check '{name}' failed with 404 but is set to be ignored.") else: raise return checks_failed diff --git a/app/config.py b/app/config.py index f1e7dff..27e24cf 100644 --- a/app/config.py +++ b/app/config.py @@ -26,6 +26,7 @@ class HealthCheck(TypedDict): name: str module: str parameters: NotRequired[dict] = {} + on_failure: NotRequired[str] = "fail" class Config(BaseModel): diff --git a/app/test_app.py b/app/test_app.py new file mode 100644 index 0000000..2dfd619 --- /dev/null +++ b/app/test_app.py @@ -0,0 +1,151 @@ +import unittest +from unittest.mock import MagicMock, patch +import sys + +from prometheus_client import REGISTRY + +class TestApp(unittest.TestCase): + + def setUp(self): + self.load_config_patcher = patch('kubernetes.config.load_config') + self.mock_load_config = self.load_config_patcher.start() + + self.apiextensions_v1_api_patcher = patch('kubernetes.client.ApiextensionsV1Api') + self.mock_apiextensions_v1_api = self.apiextensions_v1_api_patcher.start() + + self.custom_objects_api_patcher = patch('kubernetes.client.CustomObjectsApi') + self.mock_custom_objects_api = self.custom_objects_api_patcher.start() + + # import app after patch + import app + self.app = app + + self.create_health_check_cr_patcher = patch('app.create_health_check_cr') + self.mock_create_health_check_cr = self.create_health_check_cr_patcher.start() + + def tearDown(self): + self.load_config_patcher.stop() + self.apiextensions_v1_api_patcher.stop() + self.custom_objects_api_patcher.stop() + self.create_health_check_cr_patcher.stop() + # Unregister metrics to prevent duplicate metric error + for metric in ['platform_health', 'workload_health']: + if metric in REGISTRY._names_to_collectors: + REGISTRY.unregister(REGISTRY._names_to_collectors[metric]) + + @patch('app.read_config') + @patch('app.health_check_cr') + 
@patch('app.health_check_map') + def test_run_checks_onfailure_ignore(self, mock_health_check_map, mock_health_check_cr, mock_read_config): + # Mock config + from config import Config + mock_config = Config( + platform_checks=[ + { + "name": "CheckNodes", + "module": "CheckNodes", + "on_failure": "ignore" + }, + { + "name": "CheckRobinCluster", + "module": "CheckRobinCluster", + "on_failure": "fail" + } + ], + workload_checks=[] + ) + mock_read_config.return_value = mock_config + + # Mock health check modules + mock_check_nodes_class = MagicMock() + mock_check_nodes_instance = mock_check_nodes_class.return_value + mock_check_nodes_instance.is_healthy.return_value = False # Fails + + mock_check_robin_cluster_class = MagicMock() + mock_check_robin_cluster_instance = mock_check_robin_cluster_class.return_value + mock_check_robin_cluster_instance.is_healthy.return_value = False # Fails + + mock_health_check_map.__getitem__.side_effect = lambda key: { + "CheckNodes": mock_check_nodes_class, + "CheckRobinCluster": mock_check_robin_cluster_class + }[key] + + # Run the checks + self.app.run_checks() + + # Assertions + mock_health_check_cr.update_status.assert_called_once_with( + ["CheckRobinCluster"], [] + ) + + @patch('app.read_config') + @patch('app.health_check_cr') + @patch('app.health_check_map') + def test_run_checks_onfailure_fail(self, mock_health_check_map, mock_health_check_cr, mock_read_config): + # Mock config + from config import Config + mock_config = Config( + platform_checks=[ + { + "name": "CheckNodes", + "module": "CheckNodes", + "on_failure": "fail" + } + ], + workload_checks=[] + ) + mock_read_config.return_value = mock_config + + # Mock health check modules + mock_check_nodes_class = MagicMock() + mock_check_nodes_instance = mock_check_nodes_class.return_value + mock_check_nodes_instance.is_healthy.return_value = False # Fails + + mock_health_check_map.__getitem__.side_effect = lambda key: { + "CheckNodes": mock_check_nodes_class + }[key] + + # Run the 
checks + self.app.run_checks() + + # Assertions + mock_health_check_cr.update_status.assert_called_once_with( + ["CheckNodes"], [] + ) + + @patch('app.read_config') + @patch('app.health_check_cr') + @patch('app.health_check_map') + def test_run_checks_onfailure_default(self, mock_health_check_map, mock_health_check_cr, mock_read_config): + # Mock config + from config import Config + mock_config = Config( + platform_checks=[ + { + "name": "CheckNodes", + "module": "CheckNodes" + } + ], + workload_checks=[] + ) + mock_read_config.return_value = mock_config + + # Mock health check modules + mock_check_nodes_class = MagicMock() + mock_check_nodes_instance = mock_check_nodes_class.return_value + mock_check_nodes_instance.is_healthy.return_value = False # Fails + + mock_health_check_map.__getitem__.side_effect = lambda key: { + "CheckNodes": mock_check_nodes_class + }[key] + + # Run the checks + self.app.run_checks() + + # Assertions + mock_health_check_cr.update_status.assert_called_once_with( + ["CheckNodes"], [] + ) + +if __name__ == "__main__": + unittest.main() diff --git a/app/test_config.py b/app/test_config.py index 6029e5b..0391365 100644 --- a/app/test_config.py +++ b/app/test_config.py @@ -32,4 +32,13 @@ def test_optional_module_parameters(self): result = read_config() self.assertEqual(len(result.platform_checks), 4) self.assertEqual(len(result.workload_checks), 2) - + + def test_on_failure_property(self): + os.environ["APP_CONFIG_PATH"] = "testdata/onfailure_config.yaml" + result = read_config() + self.assertEqual(result.platform_checks[1]["on_failure"], "ignore") + self.assertEqual(result.workload_checks[0]["on_failure"], "fail") + self.assertEqual(result.workload_checks[1]["on_failure"], "ignore") + # Check default value + self.assertNotIn("on_failure", result.platform_checks[0]) + diff --git a/app/testdata/onfailure_config.yaml b/app/testdata/onfailure_config.yaml new file mode 100644 index 0000000..4f6485e --- /dev/null +++ 
b/app/testdata/onfailure_config.yaml @@ -0,0 +1,16 @@ +platform_checks: +- name: Node Health + module: CheckNodes +- name: Robin Cluster Health + module: CheckRobinCluster + on_failure: ignore + +workload_checks: +- name: VM Workloads Health + module: CheckVirtualMachines + parameters: + namespace: vm-workloads + on_failure: fail +- name: VM Disk Health + module: CheckVirtualMachineDisks + on_failure: ignore diff --git a/base/kustomization.yaml b/base/kustomization.yaml index 3653643..5ad4ee2 100644 --- a/base/kustomization.yaml +++ b/base/kustomization.yaml @@ -11,4 +11,4 @@ resources: images: - name: ghcr.io/gdc-consumeredge/cluster-health-validator/cluster-health-validator - newTag: "v1.2.0" + newTag: "v1.2.1" diff --git a/build.sh b/build.sh index c8526fe..adcb06c 100755 --- a/build.sh +++ b/build.sh @@ -18,9 +18,9 @@ function display_common() { function build_container() { if [[ -f ".npmrc" ]]; then - docker build -f "${DOCKERFILE}" -t "${APP}:${VERSION}" . --secret id=npmrc,src=./.npmrc + docker build -f "${DOCKERFILE}" -t "${APP}:${VERSION}" . else - docker build -f "${DOCKERFILE}" -t "${APP}:${VERSION}" . --secret id=npmrc,src=./.npmrc + docker build -f "${DOCKERFILE}" -t "${APP}:${VERSION}" . fi if [[ $? 
-ne 0 ]]; then diff --git a/config/default/gdc-clusterdefault-generated.yaml b/config/default/gdc-clusterdefault-generated.yaml index 073c5bb..663554c 100644 --- a/config/default/gdc-clusterdefault-generated.yaml +++ b/config/default/gdc-clusterdefault-generated.yaml @@ -217,7 +217,7 @@ spec: value: INFO - name: APP_CONFIG_PATH value: /config/config.yaml - image: ghcr.io/gdc-consumeredge/cluster-health-validator/cluster-health-validator:v1.2.0 + image: ghcr.io/gdc-consumeredge/cluster-health-validator/cluster-health-validator:v1.2.1 livenessProbe: httpGet: path: /health diff --git a/package/gdc-cluster-health-pkg-default.yaml b/package/gdc-cluster-health-pkg-default.yaml index 073c5bb..663554c 100644 --- a/package/gdc-cluster-health-pkg-default.yaml +++ b/package/gdc-cluster-health-pkg-default.yaml @@ -217,7 +217,7 @@ spec: value: INFO - name: APP_CONFIG_PATH value: /config/config.yaml - image: ghcr.io/gdc-consumeredge/cluster-health-validator/cluster-health-validator:v1.2.0 + image: ghcr.io/gdc-consumeredge/cluster-health-validator/cluster-health-validator:v1.2.1 livenessProbe: httpGet: path: /health