Skip to content

Commit d0e6cf3

Browse files
kwozyman and Costin Gamenț authored
Improve validation of operators (#8)
* Extend k8s connection Signed-off-by: Costin Gamenț <cos@redhat.com> * Add generic method for verifying deployment health Signed-off-by: Costin Gamenț <cos@redhat.com> * Add cert-manager deployments test Signed-off-by: Costin Gamenț <cos@redhat.com> * Add tests for sail-operator deployments readiness Signed-off-by: Costin Gamenț <cos@redhat.com> * Add test for lws operator deployment Signed-off-by: Costin Gamenț <cos@redhat.com> * Add kserve deployment test Signed-off-by: Costin Gamenț <cos@redhat.com> * update documentation Signed-off-by: Costin Gamenț <cos@redhat.com> * fix typos Signed-off-by: Costin Gamenț <cos@redhat.com> * fix help text for Makefile and README Signed-off-by: Costin Gamenț <cos@redhat.com> --------- Signed-off-by: Costin Gamenț <cos@redhat.com> Co-authored-by: Costin Gamenț <cos@redhat.com>
1 parent 2588474 commit d0e6cf3

File tree

2 files changed

+94
-11
lines changed

2 files changed

+94
-11
lines changed

validation/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,25 @@ HOST_KUBECONFIG=/path/to/kube/config make run
6464
## Validations
6565

6666
Suite: cluster -- Cluster readiness tests
67+
6768
| Test name | Meaning |
6869
| --------- | ------- |
6970
| `cloud_provider` | The validation script tries to determine the cloud provider the cluster is running on. Can be overridden with `--cloud-provider` |
7071
| `instance_type` | At least one supported instance type must be present as a cluster node. See below for details. |
7172
| `gpu_availability` | At least one supported GPU must be available on a cluster node. Availability is determined by driver presence and node labels |
7273

7374
Suite: operators -- Operator readiness tests
75+
7476
| Test name | Meaning |
7577
| --------- | ------- |
7678
| `crd_certmanager` | The tool checks if cert-manager CRDs are present on the cluster |
79+
| `operator_certmanager` | Check if cert-manager deployments are ready |
7780
| `crd_sailoperator` | The tool checks if sail-operator CRDs are present on the cluster |
81+
| `operator_sail` | Check if sail-operator deployments are ready |
7882
| `crd_lwsoperator` | The tool checks if lws-operator CRDs are present on the cluster |
83+
| `operator_lws` | Check if lws-operator deployments are ready |
7984
| `crd_kserve` | The tool checks if kserve CRDs are present on the cluster |
85+
| `operator_kserve` | Check if kserve-controller-manager deployment is ready |
8086

8187
At the end, a brief report is printed with `PASSED` or `FAILED` status for each of the above tests and the suggested action the user should follow.
8288

validation/llmd_xks_checks.py

Lines changed: 88 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@ def __init__(self, **kwargs):
2323
self.logger.debug(f"Arguments: {kwargs}")
2424
self.logger.debug("LLMDXKSChecks initialized")
2525

26-
self.k8s_core_api, self.k8s_ext_api = self._k8s_connection()
27-
28-
if self.k8s_core_api is None or self.k8s_ext_api is None:
26+
self.k8s_client = self._k8s_connection()
27+
if self.k8s_client is None:
2928
self.logger.error("Failed to connect to Kubernetes cluster")
3029
sys.exit(1)
3130

@@ -70,13 +69,27 @@ def __init__(self, **kwargs):
7069
"suggested_action": "install cert-manager",
7170
"result": False
7271
},
72+
{
73+
"name": "operator_certmanager",
74+
"function": self.test_operator_certmanager,
75+
"description": "test if the cert-manager operator is running properly",
76+
"suggested_action": "install or verify cert-manager deployment",
77+
"result": False
78+
},
7379
{
7480
"name": "crd_sailoperator",
7581
"function": self.test_crd_sailoperator,
7682
"description": "test if the cluster has the sailoperator crds",
7783
"suggested_action": "install sail-operator",
7884
"result": False
7985
},
86+
{
87+
"name": "operator_sail",
88+
"function": self.test_operator_sail,
89+
"description": "test if the sail operator is running properly",
90+
"suggested_action": "install or verify sail operator deployment",
91+
"result": False
92+
},
8093
{
8194
"name": "crd_lwsoperator",
8295
"function": self.test_crd_lwsoperator,
@@ -85,6 +98,14 @@ def __init__(self, **kwargs):
8598
"result": False,
8699
"optional": True
87100
},
101+
{
102+
"name": "operator_lws",
103+
"function": self.test_operator_lws,
104+
"description": "test if the lws-operator is running properly",
105+
"suggested_action": "install or verify lws operator deployment",
106+
"result": False,
107+
"optional": True
108+
},
88109
{
89110
"name": "crd_kserve",
90111
"function": self.test_crd_kserve,
@@ -93,6 +114,13 @@ def __init__(self, **kwargs):
93114
"result": False,
94115
"optional": False
95116
},
117+
{
118+
"name": "operator_kserve",
119+
"function": self.test_operator_kserve,
120+
"description": "test if the kserve controller is running properly",
121+
"suggested_action": "install or verify kserve deployment",
122+
"result": False,
123+
},
96124
]
97125
}
98126
}
@@ -108,18 +136,18 @@ def _log_init(self):
108136
def _k8s_connection(self):
109137
try:
110138
kubernetes.config.load_kube_config(config_file=self.kube_config)
111-
core_api = kubernetes.client.CoreV1Api()
112-
ext_api = kubernetes.client.ApiextensionsV1Api()
139+
client = kubernetes.client
140+
client.CoreV1Api()
113141
except Exception as e:
114142
self.logger.error(f"{e}")
115-
return None, None
143+
return None
116144
self.logger.info("Kubernetes connection established")
117-
return core_api, ext_api
145+
return client
118146

119147
def _get_all_crd_names(self, cache=True):
120148
if cache and self.crds_cache is not None:
121149
return self.crds_cache
122-
crd_list = self.k8s_ext_api.list_custom_resource_definition()
150+
crd_list = self.k8s_client.ApiextensionsV1Api().list_custom_resource_definition()
123151
crd_names = {crd.metadata.name for crd in crd_list.items}
124152
if cache:
125153
self.crds_cache = crd_names
@@ -136,6 +164,23 @@ def _test_crds_present(self, required_crds):
136164
self.logger.debug("All tested CRDs are present")
137165
return return_value
138166

167+
def _deployment_ready(self, namespace_name, deployment_name):
    """Return True when a Deployment has all of its desired replicas ready.

    The Deployment is read through ``self.k8s_client.AppsV1Api()`` and its
    ``status.ready_replicas`` is compared with ``spec.replicas``.

    Args:
        namespace_name: Namespace the Deployment lives in.
        deployment_name: Name of the Deployment to inspect.

    Returns:
        bool: True if at least one replica is desired and at least that
        many replicas are ready. False if the Deployment cannot be read,
        is scaled to zero, or has fewer ready replicas than desired.
    """
    try:
        deployment = self.k8s_client.AppsV1Api().read_namespaced_deployment(
            name=deployment_name, namespace=namespace_name)
    except Exception as e:
        # Any exception raised by the read is logged and reported as
        # "not ready" so the remaining checks can still run.
        self.logger.error(
            f"Failed to read deployment {namespace_name}/{deployment_name}: {e}")
        return False

    # An unset spec.replicas means the Kubernetes default of one replica.
    desired = deployment.spec.replicas
    if desired is None:
        desired = 1

    # status (or its ready_replicas field) may be None; count that as zero.
    status = deployment.status
    ready = (status.ready_replicas if status is not None else None) or 0

    if desired == 0:
        # Zero ready out of zero desired is not a running operator.
        self.logger.warning(f"Deployment {namespace_name}/{deployment_name} "
                            f"is scaled to zero replicas")
        return False

    if ready < desired:
        self.logger.warning(f"Deployment {namespace_name}/{deployment_name} has "
                            f"only {ready} replicas out of {desired} desired")
        return False

    # ready may exceed desired (NOTE(review): presumably during a rolling
    # update with surge - confirm); the deployment is still serving.
    self.logger.info(f"Deployment {namespace_name}/{deployment_name} ready")
    return True
183+
139184
def test_crd_certmanager(self):
140185
required_crds = [
141186
"certificaterequests.cert-manager.io",
@@ -150,6 +195,18 @@ def test_crd_certmanager(self):
150195
self.logger.warning("Missing cert-manager CRDs")
151196
return False
152197

198+
def test_operator_certmanager(self):
    """Check that the cert-manager deployments are ready.

    Probes the operator controller manager in the
    ``cert-manager-operator`` namespace and the webhook, cainjector and
    cert-manager deployments in the ``cert-manager`` namespace.

    Returns:
        bool: True only if every listed deployment is reported ready.
    """
    deployments = (
        ("cert-manager-operator", "cert-manager-operator-controller-manager"),
        ("cert-manager", "cert-manager-webhook"),
        ("cert-manager", "cert-manager-cainjector"),
        ("cert-manager", "cert-manager"),
    )
    # Build a full list (not a lazy generator) so every deployment is
    # probed and logged even after an earlier one has failed.
    outcomes = [
        self._deployment_ready(namespace, name)
        for namespace, name in deployments
    ]
    return all(outcomes)
209+
153210
def test_crd_sailoperator(self):
154211
required_crds = [
155212
"istiocnis.sailoperator.io",
@@ -165,6 +222,14 @@ def test_crd_sailoperator(self):
165222
self.logger.warning("Missing sail-operator CRDs")
166223
return False
167224

225+
def test_operator_sail(self):
    """Check that the sail-operator deployments are ready.

    Probes the ``istiod`` and ``servicemesh-operator3`` deployments in
    the ``istio-system`` namespace.

    Returns:
        bool: True only if both deployments are reported ready.
    """
    namespace = "istio-system"
    # Evaluate both checks up front so each deployment is probed and
    # logged regardless of the other's outcome.
    istiod_ok = self._deployment_ready(namespace, "istiod")
    operator_ok = self._deployment_ready(namespace, "servicemesh-operator3")
    return bool(istiod_ok and operator_ok)
232+
168233
def test_crd_lwsoperator(self):
169234
required_crds = [
170235
"leaderworkersets.leaderworkerset.x-k8s.io"
@@ -176,6 +241,12 @@ def test_crd_lwsoperator(self):
176241
self.logger.warning("Missing lws-operator CRDs")
177242
return False
178243

244+
def test_operator_lws(self):
    """Check that the lws-operator deployment is ready.

    Probes the ``openshift-lws-operator`` deployment in the namespace of
    the same name.

    Returns:
        bool: True if the deployment is reported ready, False otherwise.
    """
    operator = "openshift-lws-operator"
    return bool(self._deployment_ready(operator, operator))
249+
179250
def test_crd_kserve(self):
180251
required_crds = [
181252
"llminferenceservices.serving.kserve.io",
@@ -193,6 +264,12 @@ def test_crd_kserve(self):
193264
self.logger.warning("Missing kserve CRDs")
194265
return False
195266

267+
def test_operator_kserve(self):
    """Check that the kserve controller deployment is ready.

    Probes the ``kserve-controller-manager`` deployment in the
    ``opendatahub`` namespace.

    Returns:
        bool: True if the deployment is reported ready, False otherwise.
    """
    controller_ready = self._deployment_ready(
        "opendatahub", "kserve-controller-manager")
    return bool(controller_ready)
272+
196273
def test_gpu_availability(self):
197274
def nvidia_driver_present(node):
198275
allocatable = node.status.allocatable or {}
@@ -214,7 +291,7 @@ def nvidia_driver_present(node):
214291
"nvidia": 0,
215292
"other": 0,
216293
}
217-
nodes = self.k8s_core_api.list_node()
294+
nodes = self.k8s_client.CoreV1Api().list_node() or {}
218295
for node in nodes.items:
219296
labels = node.metadata.labels or {}
220297
if "nvidia.com/gpu.present" in labels:
@@ -240,7 +317,7 @@ def azure_instance_type():
240317
"Standard_ND96isr_H100_v5": 0,
241318
"Standard_ND96isr_H200_v5": 0,
242319
}
243-
nodes = self.k8s_core_api.list_node() or {}
320+
nodes = self.k8s_client.CoreV1Api().list_node() or {}
244321
for node in nodes.items:
245322
labels = node.metadata.labels
246323
instance_type = ""
@@ -274,7 +351,7 @@ def detect_cloud_provider(self):
274351
"none": 0,
275352
"azure": 0,
276353
}
277-
nodes = self.k8s_core_api.list_node() or {}
354+
nodes = self.k8s_client.CoreV1Api().list_node() or {}
278355
for node in nodes.items:
279356
labels = node.metadata.labels
280357
if "kubernetes.azure.com/cluster" in labels:

0 commit comments

Comments
 (0)