Skip to content

Commit 8c93f2d

Browse files
authored
OPNET-648: Add readiness and healthiness probes (#1315)
* test: add timeout to sync-operator
  Signed-off-by: Mat Kowalski <[email protected]>
* test: collect logs from webhook container
  Signed-off-by: Mat Kowalski <[email protected]>
* OPNET-648: Add readiness and healthiness probes to handler container
  Signed-off-by: Mat Kowalski <[email protected]>
* OPNET-648: Add readiness and healthiness probes to operator container
  Signed-off-by: Mat Kowalski <[email protected]>
---------
Signed-off-by: Mat Kowalski <[email protected]>
1 parent 5b7af4c commit 8c93f2d

File tree

8 files changed

+106
-1
lines changed

8 files changed

+106
-1
lines changed

automation/check-patch.e2e-k8s.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ teardown() {
1515
./cluster/kubectl.sh logs --tail=1000 -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.logs || true
1616
./cluster/kubectl.sh logs -p --tail=1000 -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.previous.logs || true
1717
./cluster/kubectl.sh describe pods -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.describe.logs || true
18+
./cluster/kubectl.sh logs --tail=1000 -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.logs || true
19+
./cluster/kubectl.sh logs -p --tail=1000 -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.previous.logs || true
20+
./cluster/kubectl.sh describe pods -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.describe.logs || true
1821
./cluster/kubectl.sh get events > $ARTIFACTS/cluster-events.logs || true
1922
make cluster-down
2023
# Don't fail if there are no logs

automation/check-patch.e2e-upgrade-k8s.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ teardown() {
1515
./cluster/kubectl.sh logs --tail=1000 -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.logs || true
1616
./cluster/kubectl.sh logs -p --tail=1000 -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.previous.logs || true
1717
./cluster/kubectl.sh describe pods -n nmstate -l app=kubernetes-nmstate-operator > $ARTIFACTS/kubernetes-nmstate-operator.pod.describe.logs || true
18+
./cluster/kubectl.sh logs --tail=1000 -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.logs || true
19+
./cluster/kubectl.sh logs -p --tail=1000 -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.previous.logs || true
20+
./cluster/kubectl.sh describe pods -n nmstate -l component=kubernetes-nmstate-webhook > $ARTIFACTS/kubernetes-nmstate-webhook.pod.describe.logs || true
1821
./cluster/kubectl.sh get events > $ARTIFACTS/cluster-events.logs || true
1922
make cluster-down
2023
# Don't fail if there are no logs

bundle/manifests/kubernetes-nmstate-operator.clusterserviceversion.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,28 @@ spec:
243243
value: quay.io/openshift/origin-kube-rbac-proxy:4.10.0
244244
image: quay.io/nmstate/kubernetes-nmstate-operator:latest
245245
imagePullPolicy: IfNotPresent
246+
livenessProbe:
247+
failureThreshold: 3
248+
httpGet:
249+
path: /healthz
250+
port: healthprobe
251+
initialDelaySeconds: 10
252+
periodSeconds: 10
253+
successThreshold: 1
254+
timeoutSeconds: 1
246255
name: nmstate-operator
256+
ports:
257+
- containerPort: 8081
258+
name: healthprobe
259+
readinessProbe:
260+
failureThreshold: 3
261+
httpGet:
262+
path: /readyz
263+
port: healthprobe
264+
initialDelaySeconds: 10
265+
periodSeconds: 10
266+
successThreshold: 1
267+
timeoutSeconds: 1
247268
resources:
248269
limits:
249270
cpu: 500m

cluster/sync-operator.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ function wait_ready_operator() {
2626
sleep 5
2727

2828
# Wait for deployment rollout
29-
if ! $kubectl rollout status -w -n ${OPERATOR_NAMESPACE} deployment nmstate-operator; then
29+
if ! $kubectl rollout status -w -n ${OPERATOR_NAMESPACE} deployment nmstate-operator --timeout=2m; then
3030
echo "Operator haven't turned ready within the given timeout"
3131
return 1
3232
fi

cmd/operator/main.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
_ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
3131
ctrl "sigs.k8s.io/controller-runtime"
3232
"sigs.k8s.io/controller-runtime/pkg/client"
33+
"sigs.k8s.io/controller-runtime/pkg/healthz"
3334
"sigs.k8s.io/controller-runtime/pkg/log/zap"
3435
"sigs.k8s.io/controller-runtime/pkg/manager"
3536
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
@@ -88,6 +89,7 @@ func main() {
8889
Metrics: metricsserver.Options{
8990
BindAddress: "0", // disable metrics
9091
},
92+
HealthProbeBindAddress: ":8081",
9193
}
9294

9395
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrlOptions)
@@ -96,6 +98,10 @@ func main() {
9698
os.Exit(1)
9799
}
98100

101+
if err = setupHealthProbes(mgr); err != nil {
102+
os.Exit(1)
103+
}
104+
99105
err = setupOperatorController(mgr)
100106
if err != nil {
101107
setupLog.Error(err, "unable to setup controller", "controller", "NMState")
@@ -146,3 +152,15 @@ func setProfiler() {
146152
}()
147153
}
148154
}
155+
156+
func setupHealthProbes(mgr manager.Manager) error {
157+
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
158+
setupLog.Error(err, "unable to set up healthz check")
159+
return err
160+
}
161+
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
162+
setupLog.Error(err, "unable to set up readyz check")
163+
return err
164+
}
165+
return nil
166+
}

deploy/handler/operator.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,19 @@ spec:
9090
- containerPort: 8443
9191
name: metrics
9292
protocol: TCP
93+
readinessProbe:
94+
tcpSocket:
95+
port: metrics
96+
initialDelaySeconds: 10
97+
periodSeconds: 10
98+
livenessProbe:
99+
tcpSocket:
100+
port: metrics
101+
initialDelaySeconds: 10
102+
periodSeconds: 10
103+
timeoutSeconds: 1
104+
successThreshold: 1
105+
failureThreshold: 3
93106
resources:
94107
requests:
95108
cpu: "10m"
@@ -185,6 +198,19 @@ spec:
185198
value: application/json
186199
initialDelaySeconds: 10
187200
periodSeconds: 10
201+
livenessProbe:
202+
httpGet:
203+
path: /readyz
204+
port: webhook-server
205+
scheme: HTTPS
206+
httpHeaders:
207+
- name: Content-Type
208+
value: application/json
209+
initialDelaySeconds: 10
210+
periodSeconds: 10
211+
timeoutSeconds: 1
212+
successThreshold: 1
213+
failureThreshold: 3
188214
volumeMounts:
189215
- name: tls-key-pair
190216
readOnly: true

deploy/openshift/ui-plugin/deployment.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,19 @@ spec:
4242
limits:
4343
cpu: "500m"
4444
memory: "128Mi"
45+
readinessProbe:
46+
tcpSocket:
47+
port: { { .PluginPort } }
48+
initialDelaySeconds: 10
49+
periodSeconds: 10
50+
livenessProbe:
51+
tcpSocket:
52+
port: { { .PluginPort } }
53+
initialDelaySeconds: 10
54+
periodSeconds: 10
55+
timeoutSeconds: 1
56+
successThreshold: 1
57+
failureThreshold: 3
4558
terminationMessagePolicy: FallbackToLogsOnError
4659
volumeMounts:
4760
- name: plugin-serving-cert

deploy/operator/operator.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,27 @@ spec:
5757
capabilities:
5858
drop:
5959
- ALL
60+
readinessProbe:
61+
httpGet:
62+
path: /readyz
63+
port: healthprobe
64+
initialDelaySeconds: 10
65+
periodSeconds: 10
66+
timeoutSeconds: 1
67+
successThreshold: 1
68+
failureThreshold: 3
69+
livenessProbe:
70+
httpGet:
71+
path: /healthz
72+
port: healthprobe
73+
initialDelaySeconds: 10
74+
periodSeconds: 10
75+
timeoutSeconds: 1
76+
successThreshold: 1
77+
failureThreshold: 3
78+
ports:
79+
- containerPort: 8081
80+
name: healthprobe
6081
resources:
6182
requests:
6283
cpu: "60m"

0 commit comments

Comments
 (0)