llm-d-incubation
diff --git a/‎.github/workflows/ci-pr-checks.yaml‎
Lines changed: 6 additions & 0 deletions b/‎.github/workflows/ci-pr-checks.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎charts/async-processor/dashboards/async-processor.json‎
Lines changed: 189 additions & 0 deletions b/‎charts/async-processor/dashboards/async-processor.json‎
Lines changed: 189 additions & 0 deletions
diff --git a/‎charts/async-processor/templates/ap-deployments.yaml‎
Lines changed: 12 additions & 0 deletions b/‎charts/async-processor/templates/ap-deployments.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎charts/async-processor/templates/ap-podmonitor.yaml‎
Lines changed: 20 additions & 0 deletions b/‎charts/async-processor/templates/ap-podmonitor.yaml‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎charts/async-processor/templates/grafana-dashboards-configmap.yaml‎
Lines changed: 12 additions & 0 deletions b/‎charts/async-processor/templates/grafana-dashboards-configmap.yaml‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎charts/async-processor/templates/prometheus-rule.yaml‎
Lines changed: 84 additions & 0 deletions b/‎charts/async-processor/templates/prometheus-rule.yaml‎
Lines changed: 84 additions & 0 deletions
@@ -112,6 +112,12 @@ jobs:
       - name: Lint Helm chart
         run: helm lint charts/async-processor
 
+      - name: Install helm-unittest plugin  # NOTE(review): plugin pinned to v1.0.3 and installed with --verify=false — confirm skipping verification is intended
+        run: helm plugin install --version v1.0.3 --verify=false https://github.com/helm-unittest/helm-unittest.git
+
+      - name: Run Helm unit tests  # exercises the same chart directory that the lint step above checks
+        run: helm unittest charts/async-processor
+
   # Container: build (no push) to validate Dockerfile
   container-build:
     runs-on: ubuntu-latest
 
@@ -0,0 +1,189 @@
+{
+  "annotations": {
+    "list": []
+  },
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "links": [],
+  "panels": [
+    {
+      "title": "Request Rate",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 0 },
+      "targets": [
+        {
+          "expr": "sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "total"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10 }
+        }
+      }
+    },
+    {
+      "title": "Request Rate by Queue",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 0 },
+      "targets": [
+        {
+          "expr": "sum by (queue_name) (rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "{{ queue_name }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10 }
+        }
+      }
+    },
+    {
+      "title": "Request Outcome Breakdown",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(llm_d_async_async_successful_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "successful"
+        },
+        {
+          "expr": "sum(rate(llm_d_async_async_failed_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "failed"
+        },
+        {
+          "expr": "sum(rate(llm_d_async_async_request_retries_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "retries"
+        },
+        {
+          "expr": "sum(rate(llm_d_async_async_exceeded_deadline_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "deadline exceeded"
+        },
+        {
+          "expr": "sum(rate(llm_d_async_async_shedded_requests_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "shedded"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "reqps",
+          "custom": { "drawStyle": "line", "fillOpacity": 10 }
+        }
+      }
+    },
+    {
+      "title": "Success Rate",
+      "type": "gauge",
+      "gridPos": { "h": 8, "w": 6, "x": 12, "y": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(llm_d_async_async_successful_requests_total{namespace=\"$namespace\"}[$__rate_interval])) / sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "success rate",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "thresholds": {
+            "steps": [
+              { "color": "red", "value": null },
+              { "color": "orange", "value": 0.9 },
+              { "color": "green", "value": 0.95 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Retry Rate",
+      "type": "gauge",
+      "gridPos": { "h": 8, "w": 6, "x": 18, "y": 8 },
+      "targets": [
+        {
+          "expr": "sum(rate(llm_d_async_async_request_retries_total{namespace=\"$namespace\"}[$__rate_interval])) / sum(rate(llm_d_async_async_request_total{namespace=\"$namespace\"}[$__rate_interval]))",
+          "legendFormat": "retry rate",
+          "instant": true
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "percentunit",
+          "min": 0,
+          "max": 1,
+          "thresholds": {
+            "steps": [
+              { "color": "green", "value": null },
+              { "color": "orange", "value": 0.3 },
+              { "color": "red", "value": 0.5 }
+            ]
+          }
+        }
+      }
+    },
+    {
+      "title": "Message Latency (p50 / p95 / p99)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
+          "legendFormat": "p50"
+        },
+        {
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
+          "legendFormat": "p95"
+        },
+        {
+          "expr": "histogram_quantile(0.99, sum by (le) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
+          "legendFormat": "p99"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "custom": { "drawStyle": "line", "fillOpacity": 10 }
+        }
+      }
+    },
+    {
+      "title": "Message Latency by Queue (p95)",
+      "type": "timeseries",
+      "gridPos": { "h": 8, "w": 12, "x": 12, "y": 16 },
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.95, sum by (le, queue_name) (rate(llm_d_async_async_message_latency_time_millis_bucket{namespace=\"$namespace\"}[$__rate_interval])))",
+          "legendFormat": "{{ queue_name }}"
+        }
+      ],
+      "fieldConfig": {
+        "defaults": {
+          "unit": "ms",
+          "custom": { "drawStyle": "line", "fillOpacity": 10 }
+        }
+      }
+    }
+  ],
+  "schemaVersion": 39,
+  "templating": {
+    "list": [
+      {
+        "name": "namespace",
+        "type": "query",
+        "datasource": { "type": "prometheus" },
+        "query": "label_values(llm_d_async_async_request_total, namespace)",
+        "refresh": 2,
+        "includeAll": false,
+        "current": {}
+      }
+    ]
+  },
+  "time": { "from": "now-1h", "to": "now" },
+  "title": "Async Processor",
+  "uid": "async-processor"
+}
@@ -124,6 +124,18 @@ spec:
                 key: {{ include "async-processor.redisSecretKey" . }}
           {{- end }}
         name: async-processor
+        ports:
+          - name: metrics
+            containerPort: {{ .Values.ap.metrics.port | default 9090 }}
+            protocol: TCP
+        {{- with .Values.ap.securityContext }}
+        securityContext:
+          {{- toYaml . | nindent 12 }}
+        {{- end }}
+        {{- with .Values.ap.resources }}
+        resources:
+          {{- toYaml . | nindent 12 }}
+        {{- end }}
         {{- with .Values.ap.tls.secretName }}
         volumeMounts:
           - name: tls-certs
 
@@ -0,0 +1,20 @@
+{{- if .Values.ap.podMonitor.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor  # rendered only when ap.podMonitor.enabled is truthy
+metadata:
+  name: {{ include "async-processor.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:  # chart-wide labels first, then any extra labels from ap.podMonitor.labels
+    {{- include "async-processor.labels" . | nindent 4 }}
+    {{- with .Values.ap.podMonitor.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  selector:
+    matchLabels:  # targets pods carrying the chart's selector labels
+      {{- include "async-processor.selectorLabels" . | nindent 6 }}
+  podMetricsEndpoints:
+    - port: metrics  # port is referenced by name; the deployment template names its container port "metrics"
+      path: /metrics
+      interval: {{ .Values.ap.podMonitor.interval | quote }}  # always emitted as a quoted string taken from values
+{{- end }}
@@ -0,0 +1,12 @@
+{{- if .Values.ap.grafana.dashboards.enabled }}
+apiVersion: v1
+kind: ConfigMap  # rendered only when ap.grafana.dashboards.enabled is truthy
+metadata:
+  name: {{ include "async-processor.fullname" . }}-dashboards
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "async-processor.labels" . | nindent 4 }}
+    {{ .Values.ap.grafana.dashboards.label }}: {{ .Values.ap.grafana.dashboards.labelValue | quote }}  # label key and value both come from values; presumably what a dashboard sidecar selects on — TODO confirm
+data:  # populated from every chart file matching dashboards/*.json
+  {{- (.Files.Glob "dashboards/*.json").AsConfig | nindent 2 }}
+{{- end }}
@@ -0,0 +1,84 @@
+{{- if .Values.ap.prometheusRule.enabled }}
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule  # rendered only when ap.prometheusRule.enabled is truthy
+metadata:
+  name: {{ include "async-processor.fullname" . }}
+  namespace: {{ .Release.Namespace }}
+  labels:  # chart-wide labels first, then any extra labels from ap.prometheusRule.labels
+    {{- include "async-processor.labels" . | nindent 4 }}
+    {{- with .Values.ap.prometheusRule.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+spec:
+  groups:
+    - name: async-processor.rules
+      rules:  # each ratio aggregates "by (namespace)" so the alert keeps the label that $labels.namespace reads
+        {{- if .Values.ap.prometheusRule.rules.highRetryRate.enabled }}
+        - alert: AsyncProcessorHighRetryRate  # retries / total requests, 5m rate window
+          expr: |
+            (
+              sum by (namespace) (rate(llm_d_async_async_request_retries_total{namespace="{{ .Release.Namespace }}"}[5m]))
+              /
+              sum by (namespace) (rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
+            ) > {{ .Values.ap.prometheusRule.rules.highRetryRate.threshold }}
+          for: {{ .Values.ap.prometheusRule.rules.highRetryRate.for | quote }}
+          labels:
+            severity: warning
+          annotations:
+            summary: Async processor retry rate is high
+            description: >-
+              Retry rate exceeds {{ .Values.ap.prometheusRule.rules.highRetryRate.threshold }} (ratio)
+              for async requests in namespace {{ "{{ $labels.namespace }}" }}.
+        {{- end }}
+        {{- if .Values.ap.prometheusRule.rules.highDeadlineExceededRate.enabled }}
+        - alert: AsyncProcessorHighDeadlineExceededRate  # deadline-exceeded / total requests, 5m rate window
+          expr: |
+            (
+              sum by (namespace) (rate(llm_d_async_async_exceeded_deadline_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
+              /
+              sum by (namespace) (rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
+            ) > {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.threshold }}
+          for: {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.for | quote }}
+          labels:
+            severity: warning
+          annotations:
+            summary: Async processor deadline exceeded rate is high
+            description: >-
+              Deadline exceeded rate exceeds {{ .Values.ap.prometheusRule.rules.highDeadlineExceededRate.threshold }} (ratio)
+              for async requests in namespace {{ "{{ $labels.namespace }}" }}.
+        {{- end }}
+        {{- if .Values.ap.prometheusRule.rules.lowSuccessRate.enabled }}
+        - alert: AsyncProcessorLowSuccessRate  # successful / total requests, 5m rate window; fires when BELOW threshold
+          expr: |
+            (
+              sum by (namespace) (rate(llm_d_async_async_successful_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
+              /
+              sum by (namespace) (rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
+            ) < {{ .Values.ap.prometheusRule.rules.lowSuccessRate.threshold }}
+          for: {{ .Values.ap.prometheusRule.rules.lowSuccessRate.for | quote }}
+          labels:
+            severity: critical
+          annotations:
+            summary: Async processor success rate is low
+            description: >-
+              Success rate is below {{ .Values.ap.prometheusRule.rules.lowSuccessRate.threshold }} (ratio)
+              for async requests in namespace {{ "{{ $labels.namespace }}" }}.
+        {{- end }}
+        {{- if .Values.ap.prometheusRule.rules.highShedRate.enabled }}
+        - alert: AsyncProcessorHighShedRate  # shedded / total requests, 5m rate window
+          expr: |
+            (
+              sum by (namespace) (rate(llm_d_async_async_shedded_requests_total{namespace="{{ .Release.Namespace }}"}[5m]))
+              /
+              sum by (namespace) (rate(llm_d_async_async_request_total{namespace="{{ .Release.Namespace }}"}[5m]))
+            ) > {{ .Values.ap.prometheusRule.rules.highShedRate.threshold }}
+          for: {{ .Values.ap.prometheusRule.rules.highShedRate.for | quote }}
+          labels:
+            severity: warning
+          annotations:
+            summary: Async processor shed rate is high
+            description: >-
+              Shed rate exceeds {{ .Values.ap.prometheusRule.rules.highShedRate.threshold }} (ratio)
+              for async requests in namespace {{ "{{ $labels.namespace }}" }}.
+        {{- end }}
+{{- end }}