@@ -23,9 +23,8 @@ def __init__(self, **kwargs):
2323 self .logger .debug (f"Arguments: { kwargs } " )
2424 self .logger .debug ("LLMDXKSChecks initialized" )
2525
26- self .k8s_core_api , self .k8s_ext_api = self ._k8s_connection ()
27-
28- if self .k8s_core_api is None or self .k8s_ext_api is None :
26+ self .k8s_client = self ._k8s_connection ()
27+ if self .k8s_client is None :
2928 self .logger .error ("Failed to connect to Kubernetes cluster" )
3029 sys .exit (1 )
3130
@@ -70,13 +69,27 @@ def __init__(self, **kwargs):
7069 "suggested_action" : "install cert-manager" ,
7170 "result" : False
7271 },
72+ {
73+ "name" : "operator_certmanager" ,
74+ "function" : self .test_operator_certmanager ,
75+ "description" : "test if the cert-manager operator is running properly" ,
76+ "suggested_action" : "install or verify cert-manager deployment" ,
77+ "result" : False
78+ },
7379 {
7480 "name" : "crd_sailoperator" ,
7581 "function" : self .test_crd_sailoperator ,
7682 "description" : "test if the cluster has the sailoperator crds" ,
7783 "suggested_action" : "install sail-operator" ,
7884 "result" : False
7985 },
86+ {
87+ "name" : "operator_sail" ,
88+ "function" : self .test_operator_sail ,
89+ "description" : "test if the sail operator is running properly" ,
90+ "suggested_action" : "install or verify sail operator deployment" ,
91+ "result" : False
92+ },
8093 {
8194 "name" : "crd_lwsoperator" ,
8295 "function" : self .test_crd_lwsoperator ,
@@ -85,6 +98,14 @@ def __init__(self, **kwargs):
8598 "result" : False ,
8699 "optional" : True
87100 },
101+ {
102+ "name" : "operator_lws" ,
103+ "function" : self .test_operator_lws ,
104+ "description" : "test if the lws-operator is running properly" ,
105+ "suggested_action" : "install or verify lws operator deployment" ,
106+ "result" : False ,
107+ "optional" : True
108+ },
88109 {
89110 "name" : "crd_kserve" ,
90111 "function" : self .test_crd_kserve ,
@@ -93,6 +114,13 @@ def __init__(self, **kwargs):
93114 "result" : False ,
94115 "optional" : False
95116 },
117+ {
118+ "name" : "operator_kserve" ,
119+ "function" : self .test_operator_kserve ,
120+ "description" : "test if the kserve controller is running properly" ,
121+ "suggested_action" : "install or verify kserve deployment" ,
122+ "result" : False ,
123+ },
96124 ]
97125 }
98126 }
@@ -108,18 +136,18 @@ def _log_init(self):
def _k8s_connection(self):
    """Load the kubeconfig and return the kubernetes client module.

    Returns:
        The ``kubernetes.client`` module, from which callers build API
        objects such as ``CoreV1Api()``, or None when the kubeconfig cannot
        be loaded or the API server does not answer.
    """
    try:
        kubernetes.config.load_kube_config(config_file=self.kube_config)
        client = kubernetes.client
        # Constructing an API object performs no request, so ask the server
        # for its version to prove the cluster is actually reachable before
        # reporting the connection as established.
        client.VersionApi().get_code()
    except Exception as e:
        # Any failure (bad kubeconfig, unreachable server, auth error) is
        # logged and reported to the caller as "no connection".
        self.logger.error(f"{e}")
        return None
    self.logger.info("Kubernetes connection established")
    return client
118146
119147 def _get_all_crd_names (self , cache = True ):
120148 if cache and self .crds_cache is not None :
121149 return self .crds_cache
122- crd_list = self .k8s_ext_api .list_custom_resource_definition ()
150+ crd_list = self .k8s_client . ApiextensionsV1Api () .list_custom_resource_definition ()
123151 crd_names = {crd .metadata .name for crd in crd_list .items }
124152 if cache :
125153 self .crds_cache = crd_names
@@ -136,6 +164,23 @@ def _test_crds_present(self, required_crds):
136164 self .logger .debug ("All tested CRDs are present" )
137165 return return_value
138166
167+ def _deployment_ready (self , namespace_name , deployment_name ):
168+ try :
169+ deployment = self .k8s_client .AppsV1Api ().read_namespaced_deployment (
170+ name = deployment_name , namespace = namespace_name )
171+ except Exception as e :
172+ self .logger .error (f"{ e } " )
173+ return False
174+ desired = deployment .spec .replicas
175+ ready = deployment .status .ready_replicas or 0
176+ if ready != desired :
177+ self .logger .warning (f"Deployment { namespace_name } /{ deployment_name } has "
178+ f"only { ready } replicas out of { desired } desired" )
179+ return False
180+ else :
181+ self .logger .info (f"Deployment { namespace_name } /{ deployment_name } ready" )
182+ return True
183+
139184 def test_crd_certmanager (self ):
140185 required_crds = [
141186 "certificaterequests.cert-manager.io" ,
@@ -150,6 +195,18 @@ def test_crd_certmanager(self):
150195 self .logger .warning ("Missing cert-manager CRDs" )
151196 return False
152197
198+ def test_operator_certmanager (self ):
199+ test_failed = False
200+ if not self ._deployment_ready ("cert-manager-operator" , "cert-manager-operator-controller-manager" ):
201+ test_failed = True
202+ if not self ._deployment_ready ("cert-manager" , "cert-manager-webhook" ):
203+ test_failed = True
204+ if not self ._deployment_ready ("cert-manager" , "cert-manager-cainjector" ):
205+ test_failed = True
206+ if not self ._deployment_ready ("cert-manager" , "cert-manager" ):
207+ test_failed = True
208+ return not test_failed
209+
153210 def test_crd_sailoperator (self ):
154211 required_crds = [
155212 "istiocnis.sailoperator.io" ,
@@ -165,6 +222,14 @@ def test_crd_sailoperator(self):
165222 self .logger .warning ("Missing sail-operator CRDs" )
166223 return False
167224
225+ def test_operator_sail (self ):
226+ test_failed = False
227+ if not self ._deployment_ready ("istio-system" , "istiod" ):
228+ test_failed = True
229+ if not self ._deployment_ready ("istio-system" , "servicemesh-operator3" ):
230+ test_failed = True
231+ return not test_failed
232+
168233 def test_crd_lwsoperator (self ):
169234 required_crds = [
170235 "leaderworkersets.leaderworkerset.x-k8s.io"
@@ -176,6 +241,12 @@ def test_crd_lwsoperator(self):
176241 self .logger .warning ("Missing lws-operator CRDs" )
177242 return False
178243
244+ def test_operator_lws (self ):
245+ test_failed = False
246+ if not self ._deployment_ready ("openshift-lws-operator" , "openshift-lws-operator" ):
247+ test_failed = True
248+ return not test_failed
249+
179250 def test_crd_kserve (self ):
180251 required_crds = [
181252 "llminferenceservices.serving.kserve.io" ,
@@ -193,6 +264,12 @@ def test_crd_kserve(self):
193264 self .logger .warning ("Missing kserve CRDs" )
194265 return False
195266
267+ def test_operator_kserve (self ):
268+ test_failed = False
269+ if not self ._deployment_ready ("opendatahub" , "kserve-controller-manager" ):
270+ test_failed = True
271+ return not test_failed
272+
196273 def test_gpu_availability (self ):
197274 def nvidia_driver_present (node ):
198275 allocatable = node .status .allocatable or {}
@@ -214,7 +291,7 @@ def nvidia_driver_present(node):
214291 "nvidia" : 0 ,
215292 "other" : 0 ,
216293 }
217- nodes = self .k8s_core_api . list_node ()
294+ nodes = self .k8s_client . CoreV1Api (). list_node () or {}
218295 for node in nodes .items :
219296 labels = node .metadata .labels or {}
220297 if "nvidia.com/gpu.present" in labels :
@@ -240,7 +317,7 @@ def azure_instance_type():
240317 "Standard_ND96isr_H100_v5" : 0 ,
241318 "Standard_ND96isr_H200_v5" : 0 ,
242319 }
243- nodes = self .k8s_core_api .list_node () or {}
320+ nodes = self .k8s_client . CoreV1Api () .list_node () or {}
244321 for node in nodes .items :
245322 labels = node .metadata .labels
246323 instance_type = ""
@@ -274,7 +351,7 @@ def detect_cloud_provider(self):
274351 "none" : 0 ,
275352 "azure" : 0 ,
276353 }
277- nodes = self .k8s_core_api .list_node () or {}
354+ nodes = self .k8s_client . CoreV1Api () .list_node () or {}
278355 for node in nodes .items :
279356 labels = node .metadata .labels
280357 if "kubernetes.azure.com/cluster" in labels :
0 commit comments