Skip to content

Commit c2a6d7d

Browse files
authored
helm: hardening and testing for helm charts (#240)
* helm: hardening and testing for helm charts Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com> * pin version helm unittest plugin to same version as batch-gateway Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com> * address comments Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com> --------- Signed-off-by: Edoardo Vacchi <evacchi@users.noreply.github.com>
1 parent 938cd44 commit c2a6d7d

11 files changed

Lines changed: 763 additions & 2 deletions

File tree

.github/workflows/ci-pr-checks.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,12 @@ jobs:
112112
- name: Lint Helm chart
113113
run: helm lint charts/async-processor
114114

115+
- name: Install helm-unittest plugin
116+
run: helm plugin install --version v1.0.3 --verify=false https://github.com/helm-unittest/helm-unittest.git
117+
118+
- name: Run Helm unit tests
119+
run: helm unittest charts/async-processor
120+
115121
# Container: build (no push) to validate Dockerfile
116122
container-build:
117123
runs-on: ubuntu-latest
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
{
2+
"annotations": {
3+
"list": []
4+
},
5+
"editable": true,
6+
"fiscalYearStartMonth": 0,
7+
"graphTooltip": 1,
8+
"links": [],
9+
"panels": [
10+
{
11+
"title": "Request Rate",
12+
"type": "timeseries",
13+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
14+
"targets": [
15+
{
16+
"expr": "sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
17+
"legendFormat": "total"
18+
}
19+
],
20+
"fieldConfig": {
21+
"defaults": {
22+
"unit": "reqps",
23+
"custom": { "drawStyle": "line", "fillOpacity": 10 }
24+
}
25+
}
26+
},
27+
{
28+
"title": "Request Rate by Queue",
29+
"type": "timeseries",
30+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
31+
"targets": [
32+
{
33+
"expr": "sum by (queue_name) (rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
34+
"legendFormat": "{{ queue_name }}"
35+
}
36+
],
37+
"fieldConfig": {
38+
"defaults": {
39+
"unit": "reqps",
40+
"custom": { "drawStyle": "line", "fillOpacity": 10 }
41+
}
42+
}
43+
},
44+
{
45+
"title": "Request Outcome Breakdown",
46+
"type": "timeseries",
47+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
48+
"targets": [
49+
{
50+
"expr": "sum(rate(llm_d_async_async_successful_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
51+
"legendFormat": "successful"
52+
},
53+
{
54+
"expr": "sum(rate(llm_d_async_async_failed_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
55+
"legendFormat": "failed"
56+
},
57+
{
58+
"expr": "sum(rate(llm_d_async_async_request_retries_total{namespace=\"$namespace\"}[$__rate_interval]))",
59+
"legendFormat": "retries"
60+
},
61+
{
62+
"expr": "sum(rate(llm_d_async_async_exceeded_deadline_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
63+
"legendFormat": "deadline exceeded"
64+
},
65+
{
66+
"expr": "sum(rate(llm_d_async_async_shedded_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
67+
"legendFormat": "shedded"
68+
}
69+
],
70+
"fieldConfig": {
71+
"defaults": {
72+
"unit": "reqps",
73+
"custom": { "drawStyle": "line", "fillOpacity": 10 }
74+
}
75+
}
76+
},
77+
{
78+
"title": "Success Rate",
79+
"type": "gauge",
80+
"gridPos": { "h": 8, "w": 6, "x": 12, "y": 8 },
81+
"targets": [
82+
{
83+
"expr": "sum(rate(llm_d_async_async_successful_requests_total{namespace=\"$namespace\"}[$__rate_interval])) / sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
84+
"legendFormat": "success rate",
85+
"instant": true
86+
}
87+
],
88+
"fieldConfig": {
89+
"defaults": {
90+
"unit": "percentunit",
91+
"min": 0,
92+
"max": 1,
93+
"thresholds": {
94+
"steps": [
95+
{ "color": "red", "value": null },
96+
{ "color": "orange", "value": 0.9 },
97+
{ "color": "green", "value": 0.95 }
98+
]
99+
}
100+
}
101+
}
102+
},
103+
{
104+
"title": "Retry Rate",
105+
"type": "gauge",
106+
"gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 },
107+
"targets": [
108+
{
109+
"expr": "sum(rate(llm_d_async_async_request_retries_total{namespace=\"$namespace\"}[$__rate_interval])) / sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
110+
"legendFormat": "retry rate",
111+
"instant": true
112+
}
113+
],
114+
"fieldConfig": {
115+
"defaults": {
116+
"unit": "percentunit",
117+
"min": 0,
118+
"max": 1,
119+
"thresholds": {
120+
"steps": [
121+
{ "color": "green", "value": null },
122+
{ "color": "orange", "value": 0.3 },
123+
{ "color": "red", "value": 0.5 }
124+
]
125+
}
126+
}
127+
}
128+
},
129+
{
130+
"title": "Message Latency (p50 / p95 / p99)",
131+
"type": "timeseries",
132+
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
133+
"targets": [
134+
{
135+
"expr": "histogram_quantile(0.50, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
136+
"legendFormat": "p50"
137+
},
138+
{
139+
"expr": "histogram_quantile(0.95, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
140+
"legendFormat": "p95"
141+
},
142+
{
143+
"expr": "histogram_quantile(0.99, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
144+
"legendFormat": "p99"
145+
}
146+
],
147+
"fieldConfig": {
148+
"defaults": {
149+
"unit": "ms",
150+
"custom": { "drawStyle": "line", "fillOpacity": 10 }
151+
}
152+
}
153+
},
154+
{
155+
"title": "Message Latency by Queue (p95)",
156+
"type": "timeseries",
157+
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
158+
"targets": [
159+
{
160+
"expr": "histogram_quantile(0.95, sum by (le, queue_name) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
161+
"legendFormat": "{{ queue_name }}"
162+
}
163+
],
164+
"fieldConfig": {
165+
"defaults": {
166+
"unit": "ms",
167+
"custom": { "drawStyle": "line", "fillOpacity": 10 }
168+
}
169+
}
170+
}
171+
],
172+
"schemaVersion": 39,
173+
"templating": {
174+
"list": [
175+
{
176+
"name": "namespace",
177+
"type": "query",
178+
"datasource": { "type": "prometheus" },
179+
"query": "label_values(llm_d_async_async_request_total, namespace)",
180+
"refresh": 2,
181+
"includeAll": false,
182+
"current": {}
183+
}
184+
]
185+
},
186+
"time": { "from": "now-1h", "to": "now" },
187+
"title": "Async Processor",
188+
"uid": "async-processor"
189+
}

charts/async-processor/templates/ap-deployments.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,18 @@ spec:
124124
key: {{ include "async-processor.redisSecretKey" . }}
125125
{{- end }}
126126
name: async-processor
127+
ports:
128+
- name: metrics
129+
containerPort: {{ .Values.ap.metrics.port | default 9090 }}
130+
protocol: TCP
131+
{{- with .Values.ap.securityContext }}
132+
securityContext:
133+
{{- toYaml . | nindent 12 }}
134+
{{- end }}
135+
{{- with .Values.ap.resources }}
136+
resources:
137+
{{- toYaml . | nindent 12 }}
138+
{{- end }}
127139
{{- with .Values.ap.tls.secretName }}
128140
volumeMounts:
129141
- name: tls-certs
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
{{- if .Values.ap.podMonitor.enabled }}
# PodMonitor for the async processor; rendered only when ap.podMonitor.enabled is true.
# Extra labels from ap.podMonitor.labels are appended to the chart's common labels.
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: PodMonitor
4+
metadata:
5+
name: {{ include "async-processor.fullname" . }}
6+
namespace: {{ .Release.Namespace }}
7+
labels:
8+
{{- include "async-processor.labels" . | nindent 4 }}
9+
{{- with .Values.ap.podMonitor.labels }}
10+
{{- toYaml . | nindent 4 }}
11+
{{- end }}
12+
spec:
13+
selector:
14+
matchLabels:
15+
{{- include "async-processor.selectorLabels" . | nindent 6 }}
16+
# "metrics" is a port name, not a number; it must match the named container port
# declared on the async-processor container in ap-deployments.yaml.
podMetricsEndpoints:
17+
- port: metrics
18+
path: /metrics
19+
interval: {{ .Values.ap.podMonitor.interval | quote }}
20+
{{- end }}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{{- if .Values.ap.grafana.dashboards.enabled }}
# ConfigMap carrying the chart's Grafana dashboards; rendered only when
# ap.grafana.dashboards.enabled is true.
2+
apiVersion: v1
3+
kind: ConfigMap
4+
metadata:
5+
name: {{ include "async-processor.fullname" . }}-dashboards
6+
namespace: {{ .Release.Namespace }}
7+
labels:
8+
{{- include "async-processor.labels" . | nindent 4 }}
9+
# Label key and value both come from values; presumably this is the label a Grafana
# dashboard sidecar selects on - TODO confirm against values.yaml.
{{ .Values.ap.grafana.dashboards.label }}: {{ .Values.ap.grafana.dashboards.labelValue | quote }}
10+
# One data entry per file matching dashboards/*.json in the chart.
data:
11+
{{- (.Files.Glob "dashboards/*.json").AsConfig | nindent 2 }}
12+
{{- end }}
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
{{- if .Values.ap.prometheusRule.enabled }}
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: PrometheusRule
4+
metadata:
5+
name: {{ include "async-processor.fullname" . }}
6+
namespace: {{ .Release.Namespace }}
7+
labels:
8+
{{- include "async-processor.labels" . | nindent 4 }}
9+
{{- with .Values.ap.prometheusRule.labels }}
10+
{{- toYaml . | nindent 4 }}
11+
{{- end }}
12+
spec:
13+
groups:
14+
- name: async-processor.rules
15+
rules:
16+
{{- if .Values.ap.prometheusRule.rules.highRetryRate.enabled }}
17+
- alert: AsyncProcessorHighRetryRate
18+
expr: |
19+
(
20+
sum(rate(llm_d_async_async_request_retries_total{namespace="{{ .Release.Namespace }}"}[5m]))
21+
/
22+
sum(rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
23+
) > {{ .Values.ap.prometheusRule.rules.highRetryRate.threshold }}
24+
for: {{ .Values.ap.prometheusRule.rules.highRetryRate.for | quote }}
25+
labels:
26+
severity: warning
27+
annotations:
28+
summary: Async processor retry rate is high
29+
# The expr aggregates with sum() and no "by" clause, so the alert carries no
# namespace label and $labels.namespace would render empty. The expr is already
# pinned to the release namespace, so render that value at install time instead.
description: >-
30+
Retry rate exceeds {{ .Values.ap.prometheusRule.rules.highRetryRate.threshold }} (ratio)
31+
for async requests in namespace {{ .Release.Namespace }}.
32+
{{- end }}
33+
{{- if .Values.ap.prometheusRule.rules.highDeadlineExceededRate.enabled }}
34+
- alert: AsyncProcessorHighDeadlineExceededRate
35+
expr: |
36+
(
37+
sum(rate(llm_d_async_async_exceeded_deadline_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
38+
/
39+
sum(rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
40+
) > {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.threshold }}
41+
for: {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.for | quote }}
42+
labels:
43+
severity: warning
44+
annotations:
45+
summary: Async processor deadline exceeded rate is high
46+
# The expr aggregates with sum() and no "by" clause, so the alert carries no
# namespace label and $labels.namespace would render empty. The expr is already
# pinned to the release namespace, so render that value at install time instead.
description: >-
47+
Deadline exceeded rate exceeds {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.threshold }} (ratio)
48+
for async requests in namespace {{ .Release.Namespace }}.
49+
{{- end }}
50+
{{- if .Values.ap.prometheusRule.rules.lowSuccessRate.enabled }}
51+
- alert: AsyncProcessorLowSuccessRate
52+
expr: |
53+
(
54+
sum(rate(llm_d_async_async_successful_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
55+
/
56+
sum(rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
57+
) < {{ .Values.ap.prometheusRule.rules.lowSuccessRate.threshold }}
58+
for: {{ .Values.ap.prometheusRule.rules.lowSuccessRate.for | quote }}
59+
labels:
60+
severity: critical
61+
annotations:
62+
summary: Async processor success rate is low
63+
# The expr aggregates with sum() and no "by" clause, so the alert carries no
# namespace label and $labels.namespace would render empty. The expr is already
# pinned to the release namespace, so render that value at install time instead.
description: >-
64+
Success rate is below {{ .Values.ap.prometheusRule.rules.lowSuccessRate.threshold }} (ratio)
65+
for async requests in namespace {{ .Release.Namespace }}.
66+
{{- end }}
67+
{{- if .Values.ap.prometheusRule.rules.highShedRate.enabled }}
68+
- alert: AsyncProcessorHighShedRate
69+
expr: |
70+
(
71+
sum(rate(llm_d_async_async_shedded_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
72+
/
73+
sum(rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
74+
) > {{ .Values.ap.prometheusRule.rules.highShedRate.threshold }}
75+
for: {{ .Values.ap.prometheusRule.rules.highShedRate.for | quote }}
76+
labels:
77+
severity: warning
78+
annotations:
79+
summary: Async processor shed rate is high
80+
# The expr aggregates with sum() and no "by" clause, so the alert carries no
# namespace label and $labels.namespace would render empty. The expr is already
# pinned to the release namespace, so render that value at install time instead.
description: >-
81+
Shed rate exceeds {{ .Values.ap.prometheusRule.rules.highShedRate.threshold }} (ratio)
82+
for async requests in namespace {{ .Release.Namespace }}.
83+
{{- end }}
84+
{{- end }}

0 commit comments

Comments
 (0)