Skip to content

Commit 5104fd9

Browse files
authored
Added readiness and liveness probes (#8488)
1 parent 5f27977 commit 5104fd9

File tree

20 files changed

+194
-6
lines changed

20 files changed

+194
-6
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
### Added
2+
3+
- \[Helm\] Readiness and liveness probes
4+
(<https://github.com/cvat-ai/cvat/pull/8488>)

cvat/apps/health/management/__init__.py

Whitespace-only changes.

cvat/apps/health/management/commands/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import platform
3+
from datetime import datetime, timedelta
4+
from django.core.management.base import BaseCommand, CommandError
5+
from django.conf import settings
6+
from rq.worker import Worker
7+
import django_rq
8+
9+
10+
class Command(BaseCommand):
    """Check that the RQ workers registered from this host are alive.

    For every queue name given on the command line the command verifies that:

    * the queue is declared in ``settings.RQ_QUEUES``;
    * the number of workers registered for that queue whose hostname equals
      ``platform.node()`` matches the ``NUMPROCS`` environment variable
      (1 when the variable is unset);
    * every such worker sent a heartbeat within its ``worker_ttl`` seconds.

    Any violation raises ``CommandError``.
    """

    help = "Check worker liveness in specified queues"

    def add_arguments(self, parser):
        """Register the positional ``queue_names`` argument (one or more names)."""
        parser.add_argument("queue_names", nargs="+", type=str)

    def handle(self, *args, **options):
        """Run the liveness check for every requested queue.

        Raises:
            CommandError: if a queue is not defined, the number of registered
                workers differs from the expected one, or a worker has no
                recent heartbeat.
        """
        hostname = platform.node()
        # The expected count does not depend on the queue, so read it once.
        expected_workers = int(os.getenv("NUMPROCS", 1))

        for queue_name in options["queue_names"]:
            if queue_name not in settings.RQ_QUEUES:
                raise CommandError(f"Queue {queue_name} is not defined")

            connection = django_rq.get_connection(queue_name)
            # Only workers registered from this host and serving this queue count.
            workers = [
                w
                for w in Worker.all(connection)
                if queue_name in w.queue_names() and w.hostname == hostname
            ]

            if len(workers) != expected_workers:
                raise CommandError(
                    "Number of registered workers does not match the expected number, "
                    f"actual: {len(workers)}, expected: {expected_workers}"
                )

            for worker in workers:
                last_heartbeat = worker.last_heartbeat
                if last_heartbeat is None:
                    # A registered worker that has never reported a heartbeat
                    # cannot be considered alive.
                    raise CommandError(
                        f"It seems that worker {worker.name}, pid: {worker.pid} is dead"
                    )

                # RQ records heartbeats in UTC. Compare in the same time base:
                # naive heartbeats against naive UTC "now", aware heartbeats
                # against an aware "now". Local wall-clock time would skew the
                # age by the host's UTC offset.
                if last_heartbeat.tzinfo is None:
                    now = datetime.utcnow()
                else:
                    now = datetime.now(tz=last_heartbeat.tzinfo)

                if now - last_heartbeat > timedelta(seconds=worker.worker_ttl):
                    raise CommandError(
                        f"It seems that worker {worker.name}, pid: {worker.pid} is dead"
                    )

helm-chart/Chart.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ type: application
1616
# This is the chart version. This version number should be incremented each time you make changes
1717
# to the chart and its templates, including the app version.
1818
# Versions are expected to follow Semantic Versioning (https://semver.org/)
19-
version: 0.13.2
19+
version: 0.14.0
2020

2121
# This is the version number of the application being deployed. This version number should be
2222
# incremented each time you make changes to the application. Versions are not expected to

helm-chart/templates/_helpers.tpl

+15
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,18 @@ The name of the service account to use for backend pods
169169
key: CLICKHOUSE_PASSWORD
170170
{{- end }}
171171
{{- end }}
172+
173+
{{- /*
Render an exec livenessProbe that runs `python manage.py workerprobe` followed
by the given arguments.
Expects a dict with two keys:
  - "args": list of values appended to the probe command, one per list item;
  - "livenessProbe": probe settings map. Nothing is rendered unless its
    "enabled" key is truthy; "enabled" itself is omitted from the emitted
    settings, the remaining keys are emitted via toYaml.
*/}}
{{- define "cvat.backend.worker.livenessProbe" -}}
174+
{{- if .livenessProbe.enabled }}
175+
livenessProbe:
176+
exec:
177+
command:
178+
- python
179+
- manage.py
180+
- workerprobe
181+
{{- range .args }}
182+
- {{ . }}
183+
{{- end }}
184+
{{ toYaml (omit .livenessProbe "enabled") | indent 2}}
185+
{{- end }}
186+
{{- end }}

helm-chart/templates/cvat_backend/server/deployment.yml

+14
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,20 @@ spec:
6565
{{- end }}
6666
ports:
6767
- containerPort: 8080
68+
{{- if $localValues.readinessProbe.enabled }}
69+
readinessProbe:
70+
httpGet:
71+
path: /api/server/about
72+
port: 8080
73+
{{- toYaml (omit $localValues.readinessProbe "enabled") | nindent 12 }}
74+
{{- end }}
75+
{{- if $localValues.livenessProbe.enabled }}
76+
livenessProbe:
77+
httpGet:
78+
path: /api/server/about
79+
port: 8080
80+
{{- toYaml (omit $localValues.livenessProbe "enabled") | nindent 12 }}
81+
{{- end }}
6882
volumeMounts:
6983
{{- if not .Values.cvat.backend.disableDistinctCachePerService }}
7084
- mountPath: /home/django/data/cache

helm-chart/templates/cvat_backend/utils/deployment.yml

+3-2
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,9 @@ spec:
6060
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6161
{{- toYaml . | nindent 10 }}
6262
{{- end }}
63-
ports:
64-
- containerPort: 8080
63+
{{- $probeArgs := list "notifications" "cleaning" -}}
64+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
65+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6566
volumeMounts:
6667
{{- if not .Values.cvat.backend.disableDistinctCachePerService }}
6768
- mountPath: /home/django/data/cache

helm-chart/templates/cvat_backend/worker_analyticsreports/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ spec:
6161
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6262
{{- toYaml . | nindent 10 }}
6363
{{- end }}
64+
{{- $probeArgs := list "analytics_reports" -}}
65+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
66+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6467
{{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }}
6568
volumeMounts:
6669
{{- toYaml . | nindent 10 }}

helm-chart/templates/cvat_backend/worker_annotation/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ spec:
6060
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6161
{{- toYaml . | nindent 10 }}
6262
{{- end }}
63+
{{- $probeArgs := list "annotation" -}}
64+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
65+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6366
volumeMounts:
6467
{{- if not .Values.cvat.backend.disableDistinctCachePerService }}
6568
- mountPath: /home/django/data/cache

helm-chart/templates/cvat_backend/worker_export/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@ spec:
6161
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6262
{{- toYaml . | nindent 10 }}
6363
{{- end }}
64+
{{- $probeArgs := list "export" -}}
65+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
66+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6467
volumeMounts:
6568
{{- if not .Values.cvat.backend.disableDistinctCachePerService }}
6669
- mountPath: /home/django/data/cache

helm-chart/templates/cvat_backend/worker_import/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ spec:
6060
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6161
{{- toYaml . | nindent 10 }}
6262
{{- end }}
63+
{{- $probeArgs := list "import" -}}
64+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
65+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6366
volumeMounts:
6467
{{- if not .Values.cvat.backend.disableDistinctCachePerService }}
6568
- mountPath: /home/django/data/cache

helm-chart/templates/cvat_backend/worker_qualityreports/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ spec:
6060
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6161
{{- toYaml . | nindent 10 }}
6262
{{- end }}
63+
{{- $probeArgs := list "quality_reports" -}}
64+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
65+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6366
{{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }}
6467
volumeMounts:
6568
{{- toYaml . | nindent 10 }}

helm-chart/templates/cvat_backend/worker_webhooks/deployment.yml

+3
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ spec:
6060
{{- with concat .Values.cvat.backend.additionalEnv $localValues.additionalEnv }}
6161
{{- toYaml . | nindent 10 }}
6262
{{- end }}
63+
{{- $probeArgs := list "webhooks" -}}
64+
{{- $probeConfig := dict "args" $probeArgs "livenessProbe" $.Values.cvat.backend.worker.livenessProbe -}}
65+
{{ include "cvat.backend.worker.livenessProbe" $probeConfig | indent 10 }}
6366
{{- with concat .Values.cvat.backend.additionalVolumeMounts $localValues.additionalVolumeMounts }}
6467
volumeMounts:
6568
{{- toYaml . | nindent 10 }}

helm-chart/templates/cvat_frontend/deployment.yml

+12
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ spec:
4545
{{- toYaml . | nindent 10 }}
4646
{{- end }}
4747
{{- /*
Probes are rendered before the `with` block on purpose: inside
`with .Values.cvat.frontend.additionalVolumeMounts` the dot is rebound to the
volume-mount list, so `.Values` does not resolve there, and the whole block is
skipped when no additional volume mounts are configured.
*/}}
{{- if .Values.cvat.frontend.readinessProbe.enabled }}
readinessProbe:
  tcpSocket:
    port: 80
  {{- toYaml (omit .Values.cvat.frontend.readinessProbe "enabled") | nindent 12 }}
{{- end }}
{{- if .Values.cvat.frontend.livenessProbe.enabled }}
livenessProbe:
  tcpSocket:
    port: 80
  {{- toYaml (omit .Values.cvat.frontend.livenessProbe "enabled") | nindent 12 }}
{{- end }}
{{- with .Values.cvat.frontend.additionalVolumeMounts }}
volumeMounts:
{{- toYaml . | nindent 10 }}
{{- end }}

helm-chart/templates/cvat_kvrocks/statefulset.yml

+19
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,25 @@ spec:
5959
{{- with .Values.cvat.kvrocks.additionalEnv }}
6060
{{- toYaml . | nindent 10 }}
6161
{{- end }}
62+
#https://github.com/apache/kvrocks/blob/unstable/Dockerfile
63+
{{- if .Values.cvat.kvrocks.readinessProbe.enabled }}
64+
readinessProbe:
65+
exec:
66+
command:
67+
- /bin/sh
68+
- -c
69+
- ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)'
70+
{{- toYaml (omit .Values.cvat.kvrocks.readinessProbe "enabled") | nindent 12 }}
71+
{{- end }}
72+
{{- if .Values.cvat.kvrocks.livenessProbe.enabled }}
73+
livenessProbe:
74+
exec:
75+
command:
76+
- /bin/sh
77+
- -c
78+
- ./bin/redis-cli -p 6666 PING | grep -E '(PONG|NOAUTH)'
79+
{{- toYaml (omit .Values.cvat.kvrocks.livenessProbe "enabled") | nindent 12 }}
80+
{{- end }}
6281
volumeMounts:
6382
- name: {{ .Release.Name }}-kvrocks-data
6483
mountPath: /var/lib/kvrocks/data

helm-chart/templates/cvat_opa/deployment.yml

+14
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,20 @@ spec:
5353
env:
5454
{{- toYaml . | nindent 10 }}
5555
{{- end }}
56+
{{- if .Values.cvat.opa.readinessProbe.enabled }}
57+
readinessProbe:
58+
httpGet:
59+
port: 8181
60+
path: "/health?bundles"
61+
{{- toYaml (omit .Values.cvat.opa.readinessProbe "enabled") | nindent 12 }}
62+
{{- end }}
63+
{{- if .Values.cvat.opa.livenessProbe.enabled }}
64+
livenessProbe:
65+
httpGet:
66+
port: 8181
67+
path: "/health?bundles"
68+
{{- toYaml (omit .Values.cvat.opa.livenessProbe "enabled") | nindent 12 }}
69+
{{- end }}
5670
{{- with .Values.cvat.opa.additionalVolumeMounts }}
5771
volumeMounts:
5872
{{- toYaml . | nindent 10 }}

helm-chart/values.yaml

+38
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,21 @@ cvat:
4242
additionalEnv: []
4343
additionalVolumes: []
4444
additionalVolumeMounts: []
45+
readinessProbe:
46+
enabled: true
47+
periodSeconds: 15
48+
initialDelaySeconds: 15
49+
livenessProbe:
50+
enabled: true
51+
periodSeconds: 15
52+
failureThreshold: 5
53+
initialDelaySeconds: 60
4554
worker:
55+
livenessProbe:
56+
enabled: true
57+
periodSeconds: 120
58+
initialDelaySeconds: 30
59+
timeoutSeconds: 10
4660
export:
4761
replicas: 2
4862
labels: {}
@@ -172,6 +186,14 @@ cvat:
172186
# - mountPath: /tmp
173187
# name: tmp
174188
# subPath: test
189+
readinessProbe:
190+
enabled: true
191+
periodSeconds: 10
192+
failureThreshold: 5
193+
livenessProbe:
194+
enabled: true
195+
periodSeconds: 10
196+
failureThreshold: 5
175197
service:
176198
type: ClusterIP
177199
ports:
@@ -216,6 +238,14 @@ cvat:
216238
# name: tmp
217239
# subPath: test
218240
composeCompatibleServiceName: true # Sets service name to opa in order to be compatible with Docker Compose. Necessary because changing IAM_OPA_DATA_URL via environment variables in current images. Hinders multiple deployment due to duplicate name
241+
readinessProbe:
242+
enabled: true
243+
periodSeconds: 15
244+
initialDelaySeconds: 15
245+
livenessProbe:
246+
enabled: true
247+
periodSeconds: 15
248+
initialDelaySeconds: 15
219249
service:
220250
type: ClusterIP
221251
ports:
@@ -266,6 +296,14 @@ cvat:
266296
# - mountPath: /tmp
267297
# name: tmp
268298
# subPath: test
299+
readinessProbe:
300+
enabled: true
301+
periodSeconds: 10
302+
initialDelaySeconds: 30
303+
livenessProbe:
304+
enabled: true
305+
periodSeconds: 10
306+
initialDelaySeconds: 30
269307
defaultStorage:
270308
enabled: true
271309
# storageClassName: default

supervisord/utils.conf

+2-1
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,14 @@ command=%(ENV_HOME)s/wait_for_deps.sh
2424
-i 30 --path %(ENV_HOME)s
2525
environment=VECTOR_EVENT_HANDLER="SynchronousLogstashHandler"
2626
numprocs=1
27+
autorestart=true
2728

2829
[program:rqworker-notifications]
2930
command=%(ENV_HOME)s/wait_for_deps.sh
3031
python3 %(ENV_HOME)s/manage.py rqworker -v 3 notifications
3132
--worker-class cvat.rqworker.DefaultWorker
3233
environment=VECTOR_EVENT_HANDLER="SynchronousLogstashHandler",CVAT_POSTGRES_APPLICATION_NAME="cvat:worker:notifications"
33-
numprocs=1
34+
numprocs=%(ENV_NUMPROCS)s
3435
autorestart=true
3536

3637
[program:rqworker-cleaning]

tests/python/shared/fixtures/init.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -239,12 +239,32 @@ def kube_restore_clickhouse_db():
239239
)
240240

241241

242+
def _get_redis_inmem_keys_to_keep():
243+
return ("rq:worker:", "rq:workers", "rq:scheduler_instance:", "rq:queues:")
244+
245+
242246
def docker_restore_redis_inmem():
    """Delete in-memory Redis keys in the docker setup, sparing the keys to keep.

    Scans all keys, filters out those matching the patterns returned by
    ``_get_redis_inmem_keys_to_keep`` and deletes the rest.
    """
    keep_pattern = r"\|".join(_get_redis_inmem_keys_to_keep())
    # scan every key -> drop the ones to keep -> delete the rest
    # (xargs -r: do not run `del` at all when nothing is left)
    shell_pipeline = (
        'redis-cli -e --scan --pattern "*" |'
        'grep -v "' + keep_pattern + '" |'
        "xargs -r redis-cli -e del"
    )
    docker_exec_redis_inmem(["sh", "-c", shell_pipeline])
244256

245257

246258
def kube_restore_redis_inmem():
    """Delete in-memory Redis keys in the kube setup, sparing the keys to keep.

    Same pipeline as the docker variant, but every ``redis-cli`` call
    authenticates with ``${REDIS_PASSWORD}``, which is expanded by the shell
    that runs the command.
    """
    keep_pattern = r"\|".join(_get_redis_inmem_keys_to_keep())
    # scan every key -> drop the ones to keep -> delete the rest
    # (xargs -r: do not run `del` at all when nothing is left)
    shell_pipeline = (
        'redis-cli -e -a "${REDIS_PASSWORD}" --scan --pattern "*" |'
        'grep -v "' + keep_pattern + '" |'
        'xargs -r redis-cli -e -a "${REDIS_PASSWORD}" del'
    )
    kube_exec_redis_inmem(["sh", "-c", shell_pipeline])
248268

249269

250270
def docker_restore_redis_ondisk():

0 commit comments

Comments
 (0)