From b60322c1701ae9ff3587f2affe801d9f75c40a72 Mon Sep 17 00:00:00 2001 From: sallyom Date: Fri, 4 Apr 2025 12:36:38 -0400 Subject: [PATCH 1/5] add manifests and documentation for observability Signed-off-by: sallyom --- kubernetes/observability/README.md | 122 + .../cluster-metrics.yaml | 13 + .../cluster_metrics.ocp.json | 2178 +++++++++++++++++ .../observability/grafana/deploy-grafana.sh | 18 + .../02-grafana-instance.yaml | 20 + .../02-grafana-sa-token-secret.yaml | 7 + .../02-grafana-serviceaccount.yaml | 4 + .../03-grafana-route.yaml | 14 + .../04-grafana-datasources.yaml | 56 + kubernetes/observability/minio-user-cred.yaml | 10 + .../otel-collector/clusterrole.yaml | 29 + .../otel-collector/kustomization.yaml | 8 + .../otel-collector-vllm-sidecar.yaml | 60 + .../otel-collector/otel-collector.yaml | 129 + .../observability/otel-collector/sa.yaml | 5 + .../observability/podmonitor-example-0.yaml | 20 + .../observability/podmonitor-example-1.yaml | 15 + .../observability/servicemonitor-example.yaml | 13 + .../observability/tempo/kustomization.yaml | 11 + .../tempo/minio-secret-tempo.yaml | 11 + .../observability/tempo/minio-tempo-pvc.yaml | 16 + .../observability/tempo/minio-tempo-svc.yaml | 13 + .../observability/tempo/minio-tempo.yaml | 54 + .../tempo/tempo-multitenant.yaml | 27 + .../observability/tempo/tempo-role.yaml | 28 + .../observability/tracing-ui-plugin.yaml | 8 + 26 files changed, 2889 insertions(+) create mode 100644 kubernetes/observability/README.md create mode 100644 kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml create mode 100644 kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json create mode 100755 kubernetes/observability/grafana/deploy-grafana.sh create mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml create mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml create mode 100644 
kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml create mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml create mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml create mode 100644 kubernetes/observability/minio-user-cred.yaml create mode 100644 kubernetes/observability/otel-collector/clusterrole.yaml create mode 100644 kubernetes/observability/otel-collector/kustomization.yaml create mode 100644 kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml create mode 100644 kubernetes/observability/otel-collector/otel-collector.yaml create mode 100644 kubernetes/observability/otel-collector/sa.yaml create mode 100644 kubernetes/observability/podmonitor-example-0.yaml create mode 100644 kubernetes/observability/podmonitor-example-1.yaml create mode 100644 kubernetes/observability/servicemonitor-example.yaml create mode 100644 kubernetes/observability/tempo/kustomization.yaml create mode 100644 kubernetes/observability/tempo/minio-secret-tempo.yaml create mode 100644 kubernetes/observability/tempo/minio-tempo-pvc.yaml create mode 100644 kubernetes/observability/tempo/minio-tempo-svc.yaml create mode 100644 kubernetes/observability/tempo/minio-tempo.yaml create mode 100644 kubernetes/observability/tempo/tempo-multitenant.yaml create mode 100644 kubernetes/observability/tempo/tempo-role.yaml create mode 100644 kubernetes/observability/tracing-ui-plugin.yaml diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md new file mode 100644 index 000000000..0ffac12d3 --- /dev/null +++ b/kubernetes/observability/README.md @@ -0,0 +1,122 @@ +# Monitor Llamastack & vLLM in OpenShift + +Follow this README to configure an observability stack in OpenShift to visualize Llamastack telemetry and vLLM metrics. 
+ +## OpenShift Observability Operators + +Operators are available from OperatorHub. +The following operators must be installed in order to proceed with this example. + +### Operator descriptions + +1. **Red Hat Build of OpenTelemetry**: The OpenTelemetry Collector (OTC) is provided from this operator. +Metrics and traces will be distributed from the OTC to various backends. Tempo is deployed and is the tracing backend. + +2. **Tempo Operator**: Provides `TempoStack` Custom Resource. This is the backend for distributed tracing. +An S3-compatible storage (Minio) is paired with Tempo. + +3. **Cluster Observability Operator**: This provides PodMonitor and ServiceMonitor Custom Resources which are necessary for +user-workload monitoring's prometheus to scrape workload metrics. Also, the COO provides UIPlugins for viewing telemetry. + +4. **(optional) Grafana Operator**: Provides Grafana APIs including `GrafanaDashboard`, `Grafana`, and `GrafanaDataSource` that will be used to visualize telemetry. + +## Create PodMonitor or ServiceMonitor for any AI Workload that exposes a metrics endpoint + +This is how to enable collection of user-workload metrics for any workload within OpenShift. You need to create a `PodMonitor` or a `ServiceMonitor`. +The PodMonitor will ensure all metrics from pods with matching selectors will be scraped by the user-workload-monitoring Prometheus, and a ServiceMonitor will +scrape from any pod that runs under a particular service. + +* [Example PodMonitor](./podmonitor-example-0.yaml) +* [Example ServiceMonitor](./servicemonitor-example.yaml) + +Upon creation of either, metrics will be scraped and will be visible from the console `Observe -> Metrics` dashboards. + +## Create custom resources and configurations for a central observability hub + +Create the observability hub namespace `observability-hub`. If a different namespace is created, be sure to update the resource yamls accordingly. 
+ +```bash +oc create ns observability-hub +``` + +### Tracing Backend (Tempo with Minio for S3 storage) + +```bash +# edit storageclassName & secret as necessary +# secret and storage for testing only +oc apply --kustomize ./tempo -n observability-hub +``` + +### OpenTelemetryCollector deployment + +OpenTelemetry Collector is used to aggregate telemetry from various workloads, process individual signals, and export +to various backends. This is used to collect traces from various workloads and export all as a single +authenticated stream to the in-cluster TempoStack. For in-cluster only, opentelemetry-collector is not necessary to collect +metrics. Metrics are sent to the in-cluster user-workload-monitoring prometheus by creating the podmonitors and servicemonitors. +However, if exporting off-cluster to a 3rd party observability vendor, the collector is necessary for all signals, +and can provide a single place with which to receive telemetry from various workloads and export as a single authenticated and +secure OTLP stream. + +To create a central opentelemetry-collector, update the +[otel-collector/otel-collector.yaml](./otel-collector/otel-collector.yaml) to match your requirements and then apply. + +```bash +oc apply --kustomize ./otel-collector -n observability-hub +``` + +### OpenTelemetryCollector Sidecars deployment + +You can add individual metrics endpoints to the central otel-collector in observability-hub, but +another way is to add otel-collector sidecar containers to individual deployments throughout the +cluster. Paired with an annotation on the deployment, telemetry will be exported as configured. +Any deployment with the annotation below will receive and export telemetry as configured in the +[otel-collector-vllm-sidecar.yaml](./otel-collector/otel-collector-vllm-sidecar.yaml). 
+ +The example here will add an otel-collector sidecar custom resource to the `llama-serve` namespace, +and to trigger a sidecar container, annotate any deployment's `template.metadata.annotations` with: +`sidecar.opentelemetry.io/inject: vllm-otelsidecar` + +```bash +oc apply -f ./otel-collector/otel-collector-vllm-sidecar.yaml + +# Then, annotate whatever vllm deployment you'd like to collect metrics from +# Or, add the annotation to the deployment's `template.metadata.annotations` from the console. +oc patch deployment <deployment-name> \ + -n <namespace> \ + --type='merge' \ + -p '{"spec":{"template":{"metadata":{"annotations":{"sidecar.opentelemetry.io/inject":"vllm-otelsidecar"}}}}}' +``` + +### Grafana + +This will deploy a Grafana instance, and Prometheus & Tempo DataSources. +The prometheus datasource is the user-workload-monitoring prometheus running in `openshift-user-workload-monitoring` namespace. +The Grafana console is configured with `username: rhel, password: rhel`. + +```bash +cd grafana +./deploy-grafana.sh +``` +Upon success, you can explore metrics and traces from the Grafana route. + +#### GrafanaDashboard to visualize cluster metrics and traces + +Check out [github.com/kevchu3/openshift4-grafana](https://github.com/kevchu3/openshift4-grafana/tree/master/dashboards/crds) for a list of +dashboards to deploy on OpenShift. + +Here's an example to download and deploy a GrafanaDashboard for OpenShift 4.16 cluster metrics. +The dashboard is slightly modified from https://github.com/kevchu3/openshift4-grafana/blob/master/dashboards/json_raw/cluster_metrics.ocp416.json + +```bash +oc apply -n observability-hub -f cluster-metrics-dashboard/cluster-metrics.yaml +``` + +### Cluster Observability Operator Tracing UIPlugin + +The Jaeger frontend feature of TempoStack is no longer supported by Red Hat. This has been replaced by the COO UIPlugin. To create the UIPlugin for +Tracing, first ensure the TempoStack described above is created. This is a prerequisite. 
Then, all that's necessary to view traces from +the OpenShift console at `Observe -> Traces` is to create the following [Tracing UIPlugin resource](./tracing-ui-plugin.yaml). + +```bash +oc apply -f ./tracing-ui-plugin.yaml +``` diff --git a/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml new file mode 100644 index 000000000..40c2da8df --- /dev/null +++ b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml @@ -0,0 +1,13 @@ +kind: GrafanaDashboard +apiVersion: grafana.integreatly.org/v1beta1 +metadata: + name: cluster-metrics + labels: + app: grafana +spec: + instanceSelector: + matchLabels: + dashboards: grafana # This label matches the grafana Grafana instance + # This json was copied and modified from https://github.com/kevchu3/openshift4-grafana/blob/master/dashboards/json_raw/cluster_metrics.ocp416.json + url: https://raw.githubusercontent.com/redhat-et/edge-ocp-observability/refs/heads/main/observability-hub/grafana/cluster-metrics-dashboard/cluster_metrics_ocp.json + diff --git a/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json new file mode 100644 index 000000000..8f150d607 --- /dev/null +++ b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json @@ -0,0 +1,2178 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": "prometheus", + 
"gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 53, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Health", + "type": "row" + }, + { + "dashboardFilter": "", + "dashboardTags": [], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 51, + "limit": 10, + "nameFilter": "", + "onlyAlertsOnDashboard": true, + "options": { + "alertInstanceLabelFilter": "", + "alertName": "", + "dashboardAlerts": false, + "groupBy": [], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ + "alerting", + "paused", + "no_data", + "execution_error", + "pending" + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Alerts Dashboard", + "type": "alertlist" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 32, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + 
"pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(cluster:node_instance_type_count:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Compute Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Excludes control plane", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 34, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Allocatable Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Req", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "code:cluster:ingress_http_request_count:rate5m:sum", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HTTP {{code}}", + "refId": "A", + "step": 120 + } + ], + "title": "Cluster Http Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Cluster operators in Progress, Degraded, Failing, etc...", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Conditions", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { 
+ "mode": "line+area" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": -0.05, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 49, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(cluster_operator_conditions{condition!~\"Available|Upgradeable|RetrievedUpdates\"} == 1)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{name}} ({{condition}})", + "refId": "A" + } + ], + "title": "Cluster Operators by Failed Condition", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Excludes infrastructure namespaces", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 3 + }, + "id": 38, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + 
"values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(kube_namespace_status_phase{phase='Active',namespace!~\"(default|kube|openshift).*\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Application Projects", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Excludes control plane", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 3 + }, + "id": 36, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(cluster:capacity_memory_bytes:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Allocatable Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + 
"type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 40, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_ready{condition=\"true\",namespace!~\"(default|kube|openshift).*\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Application Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Persistent volume claims for applications", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 42, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_resource_requests_storage_bytes 
{namespace!~\"(default|kube|openshift).*\"})/1e+9", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Allocated App Storage", + "type": "stat" + }, + { + "collapsed": false, + "datasource": "prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 30, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Containers in cluster by state, excluding Running and Completed pod status", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, 
+ "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 72, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (reason) (kube_pod_container_status_waiting_reason)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (reason) (kube_pod_container_status_terminated_reason{reason=~\"ContainerCannotRun|Error|OOMKilled\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "B" + } + ], + "title": "Waiting and Terminated Containers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 105, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket[5m])) by (operation_type, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "runtime: {{operation_type}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket[5m])) by (operation_type, le))", + "legendFormat": "cgroup: {{operation_type}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket[5m])) by (operation_type, le))", + "legendFormat": "worker: {{operation_type}}", + "refId": "C" + } + ], + "title": "Kubelet Operations Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + 
"spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "µs" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 66, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg by (operation_type) (rate(container_runtime_crio_operations_latency_seconds_total[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{operation_type}}", + "refId": "A" + } + ], + "title": "Container Runtime Operations Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + 
"value": null + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "id": 82, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (alertname, severity)(ALERTS{alertname!=\"Watchdog\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{alertname}} ({{severity}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (severity)(ALERTS{severity!~\"none|info|warning\"})", + "hide": true, + "legendFormat": "Critical alerts", + "refId": "B" + } + ], + "title": "Alerts by State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "id": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "code:apiserver_request_total:rate:sum", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "HTTP {{code}}", + "refId": "A" + } + ], + "title": "APIServer Requests by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "id": 101, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (verb) (rate(apiserver_request_total[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{verb}}", + "refId": "A" + } + ], + "title": "APIServer Requests by Verb", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "max": 1.05, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "transparent", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "etcd_server_has_leader", + "format": 
"time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Etcd Server has Leader", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": "prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 103, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Capacity", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Kubernetes schedules based on CPU and memory pod requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 1, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + 
"legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 21 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_request{resource='cpu',namespace!~\"(default|kube|openshift).*\"}) and count (kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_cpu_cores)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "cpu.requests", + "refId": "A", + "step": 120 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_request{resource='memory',namespace!~\"(default|kube|openshift).*\"}) and count(kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_memory_bytes)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "memory.requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_limit{resource='cpu',namespace!~\"(default|kube|openshift).*\"}) and count (kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_cpu_cores)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "cpu.limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_limit{resource='memory',namespace!~\"(default|kube|openshift).*\"}) and 
count(kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_memory_bytes)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "memory.limits", + "range": true, + "refId": "D" + } + ], + "title": "Cluster Pod Requests and Limits", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 200 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 21 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "kubelet_running_pods", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{node}}", + "refId": "A", + "step": 120 + } + ], + "title": "Pods per Node", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 21 + }, + "id": 91, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (namespace) (node_namespace_pod:kube_pod_info:{namespace!~\"(openshift).*\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 120 + } + ], + "title": "Pods per App Namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + 
"gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 21 + }, + "id": 55, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "100 * sort_desc(sum (kubelet_volume_stats_used_bytes) by (persistentvolumeclaim, namespace) / sum (kubelet_volume_stats_capacity_bytes) by (persistentvolumeclaim, namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}", + "refId": "A" + } + ], + "title": "Persistent Volume Claim Used %", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": { + "selected": true, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": "Datasource", + "regex": "", + "refresh": 1, + "sort": 0 + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + 
"15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Cluster Metrics", + "uid": "dxkdT-eWz", + "version": 2, + "weekStart": "" +} diff --git a/kubernetes/observability/grafana/deploy-grafana.sh b/kubernetes/observability/grafana/deploy-grafana.sh new file mode 100755 index 000000000..d383dcb34 --- /dev/null +++ b/kubernetes/observability/grafana/deploy-grafana.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# This will fail if the GrafanaOperator is not installed +# TODO: replace this with kustomize script & add the clusterrole & rolebinding yamls + +MONITORING_NS=observability-hub +SECRET=grafana-sa-token + +oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml -n $MONITORING_NS +oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml -n $MONITORING_NS +oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-instance.yaml -n $MONITORING_NS +oc apply -f $(pwd)/instance-with-prom-tempo-ds/03-grafana-route.yaml -n $MONITORING_NS +oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana-sa +oc adm policy add-cluster-role-to-user openshift-cluster-monitoring-view -z grafana-sa +oc adm policy add-cluster-role-to-user tempostack-traces-reader -z grafana-sa +oc adm policy add-role-to-user edit -z grafana-sa -n $MONITORING_NS +oc apply -f instance-with-prom-tempo-ds/04-grafana-datasources.yaml -n $MONITORING_NS + diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml new file mode 100644 index 000000000..6adb45444 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml @@ -0,0 +1,20 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: grafana +spec: + config: + log: + level: warn + mode: console + security: + admin_password: "rhel" + 
admin_user: "rhel" + dashboardLabelSelector: + - matchExpressions: + - key: app + operator: In + values: + - grafana diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml new file mode 100644 index 000000000..b3073e727 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-sa-token + annotations: + kubernetes.io/service-account.name: grafana-sa +type: kubernetes.io/service-account-token diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml new file mode 100644 index 000000000..1bd9aefbc --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana-sa diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml new file mode 100644 index 000000000..df8a1f7e3 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml @@ -0,0 +1,14 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: grafana-route +spec: + to: + kind: Service + name: grafana-service + weight: 100 + port: + targetPort: grafana + tls: + termination: edge + wildcardPolicy: None diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml new file mode 100644 index 000000000..04832f87e --- /dev/null +++ 
b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml @@ -0,0 +1,56 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: prometheus +spec: + instanceSelector: + matchLabels: + dashboards: grafana # This label matches the grafana Grafana instance + datasource: + name: prometheus + access: proxy + editable: true + type: prometheus + url: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091" + isDefault: true + secureJsonData: + "httpHeaderValue1": "Bearer ${token}" + jsonData: + "httpHeaderName1": "Authorization" + "timeInterval": "5s" + "tlsSkipVerify": true + valuesFrom: + - targetPath: "secureJsonData.httpHeaderValue1" + valueFrom: + secretKeyRef: + name: "grafana-sa-token" + key: "token" +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: tempo +spec: + instanceSelector: + matchLabels: + dashboards: grafana # This label matches the grafana Grafana instance + datasource: + name: tempo + access: proxy + editable: true + type: tempo + # This is specific to "observability-hub" namespace. 
If running tempostack elsewhere, need to update + url: "https://tempo-tempostack-gateway-observability-hub.apps.ocp-beta-test.nerc.mghpcc.org/api/traces/v1/dev/tempo" + isDefault: false + secureJsonData: + "httpHeaderValue1": "Bearer ${token}" + jsonData: + "httpHeaderName1": "Authorization" + "timeInterval": "5s" + "tlsSkipVerify": true + valuesFrom: + - targetPath: "secureJsonData.httpHeaderValue1" + valueFrom: + secretKeyRef: + name: "grafana-sa-token" + key: "token" diff --git a/kubernetes/observability/minio-user-cred.yaml b/kubernetes/observability/minio-user-cred.yaml new file mode 100644 index 000000000..d295a7c8f --- /dev/null +++ b/kubernetes/observability/minio-user-cred.yaml @@ -0,0 +1,10 @@ +kind: Secret +apiVersion: v1 +metadata: + name: minio-user-creds +# TEST VALUES ONLY USED IN DEV TESTING +stringData: + MINIO_ROOT_USER: test + MINIO_ROOT_PASSWORD: supersecret +type: Opaque + diff --git a/kubernetes/observability/otel-collector/clusterrole.yaml b/kubernetes/observability/otel-collector/clusterrole.yaml new file mode 100644 index 000000000..7923059d2 --- /dev/null +++ b/kubernetes/observability/otel-collector/clusterrole.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tempostack-traces-write +rules: + - apiGroups: + - 'tempo.grafana.com' + resources: + - dev + resourceNames: + - traces + verbs: + - 'create' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tempostack-traces-write +subjects: + - kind: ServiceAccount + name: otel-collector + # update if not using observability-hub namespace + namespace: observability-hub + diff --git a/kubernetes/observability/otel-collector/kustomization.yaml b/kubernetes/observability/otel-collector/kustomization.yaml new file mode 100644 index 000000000..634d19bb3 --- /dev/null +++ 
b/kubernetes/observability/otel-collector/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- sa.yaml +- clusterrole.yaml +- otel-collector.yaml diff --git a/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml new file mode 100644 index 000000000..d23ae8cc5 --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml @@ -0,0 +1,60 @@ +# Once this exists, any pod with the template.metadata.annotation below will send metrics +# to observability-hub: +# sidecar.opentelemetry.io/inject: vllm-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: vllm-otelsidecar + namespace: llama-serve +spec: + observability: + metrics: {} + deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars can export to the central observability-hub otel-collector, then be + # exported to various backends from there (in-cluster, external 3rd party) + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + prometheus: + config: + scrape_configs: + - job_name: vllm-sidecar + scrape_interval: 5s + static_configs: + - targets: + - 'localhost:8000' + service: + pipelines: + metrics: + exporters: + - debug + - otlphttp + receivers: + - prometheus + telemetry: + metrics: + address: '0.0.0.0:8888' + mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: SingleStack diff --git 
a/kubernetes/observability/otel-collector/otel-collector.yaml b/kubernetes/observability/otel-collector/otel-collector.yaml new file mode 100644 index 000000000..013d02747 --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector.yaml @@ -0,0 +1,129 @@ +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector +spec: + serviceAccount: otel-collector + config: + extensions: + bearertokenauth: + filename: "/var/run/secrets/kubernetes.io/serviceaccount/token" + + exporters: + debug: + verbosity: basic + #otlphttp/dynatrace: + # update endpoint and Api-Token before deploying + #endpoint: "https://XXXXXXX.live.dynatrace.com/api/v2/otlp" + #headers: + #Authorization: "Api-Token dxxxxxx.XXXXXXXXXXXXXXX" + # Export the dev tenant traces to a Tempo instance + otlphttp/dev: + endpoint: https://tempo-tempostack-gateway.observability-hub.svc.cluster.local:8080/api/traces/v1/dev + tls: + insecure: false + ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" + auth: + authenticator: bearertokenauth + headers: + X-Scope-OrgID: "dev" + # cluster user-workload monitoring prometheus backend + #prometheus/ocp-uwm: + # add_metric_suffixes: false + # endpoint: 0.0.0.0:8889 + # metric_expiration: 180m + # resource_to_telemetry_conversion: + # enabled: true + + receivers: + prometheus: + config: + scrape_configs: + # service/vllm in ns/llama-serve + # add any service.ns.svc.cluster.local:port that includes a /metrics endpoint + # If you use otel-sidecars in each deployment, you do not need to list them here. 
+ - job_name: vllm-llama-serve + scrape_interval: 15s + static_configs: + - targets: + - 'vllm.llama-serve.svc.cluster.local:8000' + # service/safety in ns/llama-serve + - job_name: vllm-safety-serve + scrape_interval: 15s + static_configs: + - targets: + - 'safety.llama-serve.svc.cluster.local:8000' + otlp: + protocols: + grpc: {} + #endpoint: 0.0.0.0:4317 + #tls: + # cert_file: /certs/server.crt + # client_ca_file: /certs/ca.crt + # key_file: /certs/server.key + http: {} + #endpoint: 0.0.0.0:4318 + #tls: + # cert_file: /certs/server.crt + # client_ca_file: /certs/ca.crt + # key_file: /certs/server.key + + processors: + batch: + send_batch_size: 100 + timeout: 1s + # cumulativetodelta necessary to export to dynatrace + # Dynatrace only accepts delta metrics + # OCP user-workload-monitoring only accepts cumulative metrics + #cumulativetodelta: {} + memory_limiter: + check_interval: 5s + limit_percentage: 95 + spike_limit_percentage: 25 + + service: + extensions: + - bearertokenauth + pipelines: + metrics: + exporters: + - debug + #- prometheus/ocp-uwm + #- otlphttp/dynatrace + receivers: + - otlp + - prometheus + processors: + - memory_limiter # keep first in the pipeline (see memory_limiter processor docs) + #- cumulativetodelta + - batch + traces: + exporters: + - debug + - otlphttp/dev + #- otlphttp/dynatrace + receivers: + - otlp + processors: + - memory_limiter # keep first in the pipeline (see memory_limiter processor docs) + - batch + telemetry: + metrics: + address: 0.0.0.0:8888 + ingress: + route: + termination: passthrough + type: route + mode: deployment + observability: + metrics: + enableMetrics: true + upgradeStrategy: automatic + #volumeMounts: + #- mountPath: /certs + # name: mtls-certs + #volumes: + #- secret: + # secretName: mtls-certs + # name: mtls-certs diff --git a/kubernetes/observability/otel-collector/sa.yaml b/kubernetes/observability/otel-collector/sa.yaml new file mode 100644 index 000000000..d820a7ded --- /dev/null +++ b/kubernetes/observability/otel-collector/sa.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + diff
--git a/kubernetes/observability/podmonitor-example-0.yaml b/kubernetes/observability/podmonitor-example-0.yaml new file mode 100644 index 000000000..eefc64d49 --- /dev/null +++ b/kubernetes/observability/podmonitor-example-0.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor +metadata: + name: vllm-llama-serve-monitor +spec: + namespaceSelector: {} + podMetricsEndpoints: + - bearerTokenSecret: + key: "" + interval: 30s + path: /metrics + selector: + matchExpressions: + - key: app + operator: In + values: + - safety + - llama32-3b + - granite-8b + - llama31-70b diff --git a/kubernetes/observability/podmonitor-example-1.yaml b/kubernetes/observability/podmonitor-example-1.yaml new file mode 100644 index 000000000..8e4228d6f --- /dev/null +++ b/kubernetes/observability/podmonitor-example-1.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor +metadata: + name: vllm-llama-serve-monitor + namespace: llama-serve +spec: + namespaceSelector: {} + podMetricsEndpoints: + - bearerTokenSecret: + key: '' + interval: 30s + path: /metrics + selector: + matchLabels: + app: vllm # Must match the pod labels diff --git a/kubernetes/observability/servicemonitor-example.yaml b/kubernetes/observability/servicemonitor-example.yaml new file mode 100644 index 000000000..1156857a2 --- /dev/null +++ b/kubernetes/observability/servicemonitor-example.yaml @@ -0,0 +1,13 @@ +apiVersion: monitoring.rhobs/v1 +kind: ServiceMonitor +metadata: + name: vllm-llama-serve + namespace: llama-serve +spec: + selector: + matchLabels: + app: vllm # Must match the Service labels + endpoints: + - port: "8000" # Must match the Service port name + path: /metrics # Path to your metrics endpoint + interval: 5s diff --git a/kubernetes/observability/tempo/kustomization.yaml b/kubernetes/observability/tempo/kustomization.yaml new file mode 100644 index 000000000..b72d8ca47 --- /dev/null +++ b/kubernetes/observability/tempo/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- tempo-role.yaml +- minio-secret-tempo.yaml +- minio-tempo-pvc.yaml +- minio-tempo-svc.yaml +- minio-tempo.yaml +- tempo-multitenant.yaml diff --git a/kubernetes/observability/tempo/minio-secret-tempo.yaml b/kubernetes/observability/tempo/minio-secret-tempo.yaml new file mode 100644 index 000000000..9d66e66db --- /dev/null +++ b/kubernetes/observability/tempo/minio-secret-tempo.yaml @@ -0,0 +1,11 @@ +kind: Secret +apiVersion: v1 +metadata: + name: minio-tempo +# TEST VALUES ONLY USED IN DEV TESTING +stringData: + access_key_id: tempo # notsecret + access_key_secret: supersecret # notsecret + bucket: tempo # notsecret + endpoint: http://minio-tempo.observability-hub.svc:9000 # notsecret; namespace must match where the minio-tempo Service is deployed +type: Opaque diff --git a/kubernetes/observability/tempo/minio-tempo-pvc.yaml b/kubernetes/observability/tempo/minio-tempo-pvc.yaml new file mode 100644 index 000000000..fa53f6e5e --- /dev/null +++ b/kubernetes/observability/tempo/minio-tempo-pvc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + finalizers: + - kubernetes.io/pvc-protection + labels: + app.kubernetes.io/name: minio-tempo + name: minio-tempo +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 12Gi + volumeMode: Filesystem + diff --git a/kubernetes/observability/tempo/minio-tempo-svc.yaml b/kubernetes/observability/tempo/minio-tempo-svc.yaml new file mode 100644 index 000000000..c75e95b8c --- /dev/null +++ b/kubernetes/observability/tempo/minio-tempo-svc.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: minio-tempo +spec: + ports: + - protocol: TCP + port: 9000 + targetPort: 9000 + internalTrafficPolicy: Cluster + type: ClusterIP + selector: + app.kubernetes.io/name: minio-tempo diff --git a/kubernetes/observability/tempo/minio-tempo.yaml b/kubernetes/observability/tempo/minio-tempo.yaml new file mode 100644 index 000000000..9f7449c1b --- /dev/null
+++ b/kubernetes/observability/tempo/minio-tempo.yaml @@ -0,0 +1,54 @@ +kind: Deployment +apiVersion: apps/v1 +metadata: + name: minio-tempo +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: minio-tempo + template: + metadata: + labels: + app.kubernetes.io/name: minio-tempo + spec: + volumes: + - name: storage + persistentVolumeClaim: + claimName: minio-tempo + containers: + - resources: {} + name: minio-tempo + command: + - /bin/sh + - '-c' + - | + mkdir -p /storage/tempo && \ + minio server /storage + env: + # TEST VALUES ONLY USED IN TEST DEV ENV + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-user-creds + key: MINIO_ROOT_USER + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-user-creds + key: MINIO_ROOT_PASSWORD + ports: + - containerPort: 9000 + protocol: TCP + imagePullPolicy: Always + volumeMounts: + - name: storage + mountPath: /storage + image: quay.io/minio/minio + restartPolicy: Always + dnsPolicy: ClusterFirst + securityContext: {} + strategy: + type: Recreate + progressDeadlineSeconds: 600 + diff --git a/kubernetes/observability/tempo/tempo-multitenant.yaml b/kubernetes/observability/tempo/tempo-multitenant.yaml new file mode 100644 index 000000000..ebcf8663f --- /dev/null +++ b/kubernetes/observability/tempo/tempo-multitenant.yaml @@ -0,0 +1,27 @@ +# based on config/samples/openshift/tempo_v1alpha1_multitenancy.yaml +apiVersion: tempo.grafana.com/v1alpha1 +kind: TempoStack +metadata: + name: tempostack +spec: + storage: + secret: + name: minio-tempo + type: s3 + storageSize: 15Gi + resources: + total: + limits: + memory: 10Gi + cpu: 5000m + tenants: + mode: openshift + authentication: + - tenantName: dev + tenantId: "1610b0c3-c509-4592-a256-a1871353dbfa" + template: + gateway: + enabled: true + queryFrontend: + jaegerQuery: + enabled: true diff --git a/kubernetes/observability/tempo/tempo-role.yaml b/kubernetes/observability/tempo/tempo-role.yaml new file mode 100644 index 
000000000..be1841d8e --- /dev/null +++ b/kubernetes/observability/tempo/tempo-role.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tempostack-traces-reader +rules: + - apiGroups: + - 'tempo.grafana.com' + resources: + - dev + resourceNames: + - traces + verbs: + - 'get' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tempostack-traces-reader +subjects: + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: system:authenticated + diff --git a/kubernetes/observability/tracing-ui-plugin.yaml b/kubernetes/observability/tracing-ui-plugin.yaml new file mode 100644 index 000000000..9791b61c3 --- /dev/null +++ b/kubernetes/observability/tracing-ui-plugin.yaml @@ -0,0 +1,8 @@ +apiVersion: observability.openshift.io/v1alpha1 +kind: UIPlugin +metadata: + # It will not work with any other name + name: distributed-tracing +spec: + type: DistributedTracing + From 352fcabd21386c5f168c16e3d35876a0bc4a907a Mon Sep 17 00:00:00 2001 From: sallyom Date: Fri, 4 Apr 2025 13:41:52 -0400 Subject: [PATCH 2/5] add otel configs in llamastack manifests Signed-off-by: sallyom --- kubernetes/observability/README.md | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md index 0ffac12d3..b30ed679b 100644 --- a/kubernetes/observability/README.md +++ b/kubernetes/observability/README.md @@ -2,6 +2,53 @@ Follow this README to configure an observability stack in OpenShift to visualize Llamastack telemetry and vLLM metrics. +## Generate telemetry from Llamastack and vLLM + +### vLLM + +For vLLM, metrics are generated by default and are exposed at `vllm-endpoint:port/metrics`. For a list of metrics, +you can `curl localhost:8000/metrics` from within a vLLM container. 
+ +### Llamastack + +With Llamastack, you need to specify in the run-config.yaml to enable telemetry collection with an opentelemetry receiver. +Here's how to do that: + +#### Updated manifests for telemetry trace collection with opentelemetry receiver endpoint + +This is for traces only. There is a similar `otel_metric` sink and `otel_metric_endpoint`, however, there are currently +only 4 metrics generated within Llamastack, and these are duplicates of what vLLM provides. + +[kubernetes/llama-stack/configmap.yaml](../llama-stack/configmap.yaml) + +```yaml +--- + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console, otel_trace, sqlite} <-add otel_trace and/or otel_metric + otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:} <-add ONLY if opentelemetry receiver endpoint is available. +--- +``` +And, in [kubernetes/llama-stack/deployment.yaml](../llama-stack/deployment.yaml) + +```yaml +--- + env: + - name: OTEL_SERVICE_NAME + value: llamastack + - name: OTEL_TRACE_ENDPOINT + value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces + #- name: OTEL_METRIC_ENDPOINT + #- value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/metrics +--- +``` + +The otel-endpoint is `http://service-name-otc.namespace-of-otc.svc.cluster.local:4318/v1/traces,metrics` if exporting to +central otel-collector. If using otel-collector sidecar, this would be `http://localhost:4318/v1/traces,metrics`. 
+ ## OpenShift Observability Operators Operators are available from OperatorHub From 5e45f76fd5aa06ca976d9f95943c9018f227a567 Mon Sep 17 00:00:00 2001 From: sallyom Date: Mon, 7 Apr 2025 13:59:44 -0400 Subject: [PATCH 3/5] update to add vLLM tracing guide Signed-off-by: sallyom --- kubernetes/observability/README.md | 82 ++++---------- .../otel-collector-vllm-sidecar.yaml | 13 ++- .../otel-collector/otel-collector.yaml | 7 -- kubernetes/observability/run-configuration.md | 107 ++++++++++++++++++ kubernetes/observability/vllm-Containerfile | 9 ++ 5 files changed, 152 insertions(+), 66 deletions(-) create mode 100644 kubernetes/observability/run-configuration.md create mode 100644 kubernetes/observability/vllm-Containerfile diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md index b30ed679b..5f51f60e9 100644 --- a/kubernetes/observability/README.md +++ b/kubernetes/observability/README.md @@ -1,53 +1,8 @@ # Monitor Llamastack & vLLM in OpenShift Follow this README to configure an observability stack in OpenShift to visualize Llamastack telemetry and vLLM metrics. +First, ensure Llamastack and vLLM are configured to generate telemetry by following this [configuration guide](./run-configuration.md) -## Generate telemetry from Llamastack and vLLM - -### vLLM - -For vLLM, metrics are generated by default and are exposed at `vllm-endpoint:port/metrics`. For a list of metrics, -you can `curl localhost:8000/metrics` from within a vLLM container. - -### Llamastack - -With Llamastack, you need to specify in the run-config.yaml to enable telemetry collection with an opentelemetry receiver. -Here's how to do that: - -#### Updated manifests for telemetry trace collection with opentelemetry receiver endpoint - -This is for traces only. There is a similar `otel_metric` sink and `otel_metric_endpoint`, however, there are currently -only 4 metrics generated within Llamastack, and these are duplicates of what vLLM provides. 
- -[kubernetes/llama-stack/configmap.yaml](../llama-stack/configmap.yaml) - -```yaml ---- - telemetry: - - provider_id: meta-reference - provider_type: inline::meta-reference - config: - service_name: ${env.OTEL_SERVICE_NAME:llama-stack} - sinks: ${env.TELEMETRY_SINKS:console, otel_trace, sqlite} <-add otel_trace and/or otel_metric - otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:} <-add ONLY if opentelemetry receiver endpoint is available. ---- -``` -And, in [kubernetes/llama-stack/deployment.yaml](../llama-stack/deployment.yaml) - -```yaml ---- - env: - - name: OTEL_SERVICE_NAME - value: llamastack - - name: OTEL_TRACE_ENDPOINT - value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces - #- name: OTEL_METRIC_ENDPOINT - #- value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/metrics ---- -``` - -The otel-endpoint is `http://service-name-otc.namespace-of-otc.svc.cluster.local:4318/v1/traces,metrics` if exporting to -central otel-collector. If using otel-collector sidecar, this would be `http://localhost:4318/v1/traces,metrics`. ## OpenShift Observability Operators @@ -88,6 +43,12 @@ oc create ns observability-hub ### Tracing Backend (Tempo with Minio for S3 storage) +In order to view distributed tracing data from LLamastack and/or vLLM, you must deploy a tracing backend. The supported tracing backend in OpenShift +is Tempo. See the OpenShift Tempo +[documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/distributed_tracing/distributed-tracing-platform-tempo#distr-tracing-tempo-install-tempostack-web-console_dist-tracing-tempo-installing) +for further details. Tempo must be paired with a storage solution. For this example, `MinIO` is used. The necessary resources can be created by +applying the `./tempo` manifests. 
+ ```bash # edit storageclassName & secret as necessary # secret and storage for testing only @@ -97,7 +58,7 @@ oc apply --kustomize ./tempo -n observability-hub ### OpenTelemetryCollector deployment OpenTelemetry Collector is used to aggregate telemetry from various workloads, process individual signals, and export -to various backends. This is used to collect traces from various workloads and export all as a single +to various backends. This example will collect traces from various workloads and export all as a single authenticated stream to the in-cluster TempoStack. For in-cluster only, opentelemetry-collector is not necessary to collect metrics. Metrics are sent to the in-cluster user-workload-monitoring prometheus by creating the podmonitors and servicemonitors. However, if exporting off-cluster to a 3rd party observability vendor, the collector is necessary for all signals, @@ -134,9 +95,24 @@ oc patch deployment \ -p '{"spec":{"template":{"metadata":{"annotations":{"sidecar.opentelemetry.io/inject":"vllm-otelsidecar"}}}}}' ``` +### Cluster Observability Operator Tracing UIPlugin + +The Jaeger frontend feature of TempoStack is no longer supported by Red Hat. This has been replaced by the COO UIPlugin. To create the UIPlugin for +Tracing, first ensure the TempoStack described above is created. This is a prerequisite. Then, all that's necessary to view traces from +the OpenShift console at `Observe -> Traces` is to create the following [Tracing UIPlugin resource](./tracing-ui-plugin.yaml). + +```bash +oc apply -f ./tracing-ui-plugin.yaml +``` + +You should now see traces and metrics in the OpenShift console, from the `Observe` tab. + ### Grafana -This will deploy a Grafana instance, and Prometheus & Tempo DataSources +Most users are familiar with Grafana for visualizing and analyzing telemetry. To create the Grafana resources necessary to view +Llamastack and vLLM telemetry, follow the below example. 
+ +This example will deploy a Grafana instance, and Prometheus & Tempo DataSources The prometheus datasource is the user-workload-monitoring prometheus running in `openshift-user-workload-monitoring` namespace. The Grafana console is configured with `username: rhel, password: rhel` @@ -157,13 +133,3 @@ The dashboard is slightly modified from https://github.com/kevchu3/openshift4-gr ```bash oc apply -n observability-hub -f cluster-metrics-dashboard/cluster-metrics.yaml ``` - -### Cluster Observability Operator Tracing UIPlugin - -The Jaeger frontend feature of TempoStack is no longer supported by Red Hat. This has been replaced by the COO UIPlugin. To create the UIPlugin for -Tracing, first ensure the TempoStack described above is created. This is a prerequisite. Then, all that's necessary to view traces from -the OpenShift console at `Observe -> Traces` is to create the following [Tracing UIPlugin resource](./tracing-ui-plugin.yaml). - -```bash -oc apply ./tracing-ui-plugin.yaml -``` diff --git a/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml index d23ae8cc5..10750f632 100644 --- a/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml +++ b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml @@ -21,22 +21,33 @@ spec: insecure: true processors: {} receivers: + otlp: + protocols: + grpc: {} + http: {} prometheus: config: scrape_configs: - job_name: vllm-sidecar - scrape_interval: 5s + scrape_interval: 15s static_configs: - targets: - 'localhost:8000' service: pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp metrics: exporters: - debug - otlphttp receivers: - prometheus + - otlp telemetry: metrics: address: '0.0.0.0:8888' diff --git a/kubernetes/observability/otel-collector/otel-collector.yaml b/kubernetes/observability/otel-collector/otel-collector.yaml index 013d02747..b91fa415b 100644 --- 
a/kubernetes/observability/otel-collector/otel-collector.yaml +++ b/kubernetes/observability/otel-collector/otel-collector.yaml @@ -28,13 +28,6 @@ spec: authenticator: bearertokenauth headers: X-Scope-OrgID: "dev" - # cluster user-workload monitoring prometheus backend - #prometheus/ocp-uwm: - # add_metric_suffixes: false - # endpoint: 0.0.0.0:8889 - # metric_expiration: 180m - # resource_to_telemetry_conversion: - # enabled: true receivers: prometheus: diff --git a/kubernetes/observability/run-configuration.md b/kubernetes/observability/run-configuration.md new file mode 100644 index 000000000..662fdda7d --- /dev/null +++ b/kubernetes/observability/run-configuration.md @@ -0,0 +1,107 @@ +## Generate telemetry from Llamastack and vLLM + +### vLLM + +#### metrics + +For vLLM, metrics are generated by default and are exposed at `vllm-endpoint:port/metrics`. For a list of metrics, +you can `curl localhost:8000/metrics` from within a vLLM container. + +#### traces + +It's possible to generate vLLM distributed trace data by updating the vLLM image and start command. This [Containerfile](./vllm-Containerfile) +shows the necessary packages to generate vLLM traces. + +Here is how you would build vLLM with the tracing packages: + +```bash +podman build --platform x86_64 -t quay.io/[your-quay-username]/vllm:otlp-tracing -f vllm-Containerfile . +podman push quay.io/[your-quay-username]/vllm:otlp-tracing +``` + +Then, add the following updates to the vLLM deployment.yaml. We'll use the [granite-8b deployment](../llama-serve/granite-8b/vllm.yaml): +This example assumes there is an OpenTelemetryCollector with sidecar mode in the same namespace. 
+See [OpenTelemetryCollector Sidecars Deployment](./README.md#opentelemetrycollector-sidecars-deployment) + + +```yaml +--- + template: + metadata: + labels: + app: granite-8b + annotations: + sidecar.opentelemetry.io/inject: vllm-otelsidecar + spec: + containers: + - args: + - --model + - ibm-granite/granite-3.2-8b-instruct + - --max-model-len + - "128000" + - --enable-auto-tool-choice + - --chat-template + - /app/tool_chat_template_granite.jinja + - --tool-call-parser=granite + - --otlp-traces-endpoint + - 127.0.0.1:4317 + - --collect-detailed-traces + - "all" + - --port + - "8000" + image: 'quay.io/sallyom/vllm:otlp-tracing' + env: + - name: OTEL_SERVICE_NAME + value: "vllm-granite8b" + - name: OTEL_EXPORTER_OTLP_TRACES_INSECURE + value: "true" +--- +``` + +With the updated vLLM image and the updated deployment, distributed trace data will be generated and collected by the opentelemetry-collector +sidecar container and exported to the central observability-hub as outlined in the [README.md](./README.md) with a `TempoStack` as a tracing backend. +There is a performance impact with enabling tracing with vLLM, so it's recommended to update the deployment to enable tracing only when debugging to +avoid the performance impact. A complete list of vLLM engine arguments can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html). + +### Llamastack + +With Llamastack, you need to specify in the run-config.yaml to enable telemetry collection with an opentelemetry receiver. +Here's how to do that: + +#### Updated manifests for telemetry trace collection with opentelemetry receiver endpoint + +This is for traces only. There is a similar `otel_metric` sink and `otel_metric_endpoint`, however, there are currently +only 4 metrics generated within Llamastack, and these are duplicates of what vLLM provides. 
+ +[kubernetes/llama-stack/configmap.yaml](../llama-stack/configmap.yaml) + +```yaml +--- + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console, otel_trace, sqlite} <-add otel_trace and/or otel_metric + otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:} <-add ONLY if opentelemetry receiver endpoint is available. +--- +``` +And, in [kubernetes/llama-stack/deployment.yaml](../llama-stack/deployment.yaml) + +```yaml +--- + env: + - name: OTEL_SERVICE_NAME + value: llamastack + - name: OTEL_TRACE_ENDPOINT + value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces + #- name: OTEL_METRIC_ENDPOINT + #- value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/metrics +--- +``` + +The otel-endpoint is `http://service-name-otc.namespace-of-otc.svc.cluster.local:4318/v1/traces,metrics` if exporting to +central otel-collector. If using otel-collector sidecar, this would be `http://localhost:4318/v1/traces,metrics`. + +Now that vLLM and Llamastack are configured to generate and export telemetry, follow the [observability-hub guide](./README.md) to view and analyze +the data. 
diff --git a/kubernetes/observability/vllm-Containerfile b/kubernetes/observability/vllm-Containerfile new file mode 100644 index 000000000..3bce23fdd --- /dev/null +++ b/kubernetes/observability/vllm-Containerfile @@ -0,0 +1,9 @@ +# Use the vllm-openai image as the base +FROM docker.io/vllm/vllm-openai:v0.7.3 + +# Install OpenTelemetry packages +RUN pip install \ + "opentelemetry-sdk>=1.26.0,<1.27.0" \ + "opentelemetry-api>=1.26.0,<1.27.0" \ + "opentelemetry-exporter-otlp>=1.26.0,<1.27.0" \ + "opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0" From 02d9e71f96eee8f8db4d990a8af58cb094cad1b6 Mon Sep 17 00:00:00 2001 From: sallyom Date: Thu, 10 Apr 2025 13:18:24 -0400 Subject: [PATCH 4/5] update grafana deployments & add llamastack-sidecar Signed-off-by: sallyom --- kubernetes/observability/README.md | 4 +- .../observability/grafana/deploy-grafana.sh | 18 ------ ...fana-datasources.yaml => datasources.yaml} | 2 +- ...na-instance.yaml => grafana-instance.yaml} | 0 ...na-serviceaccount.yaml => grafana-sa.yaml} | 0 .../kustomization.yaml | 12 ++++ .../instance-with-prom-tempo-ds/role.yaml | 58 +++++++++++++++++++ .../{03-grafana-route.yaml => route.yaml} | 0 ...token-secret.yaml => sa-token-secret.yaml} | 0 .../otel-collector-llamastack-sidecar.yaml | 55 ++++++++++++++++++ kubernetes/observability/run-configuration.md | 31 +++++++--- .../tempo/minio-secret-tempo.yaml | 2 +- 12 files changed, 152 insertions(+), 30 deletions(-) delete mode 100755 kubernetes/observability/grafana/deploy-grafana.sh rename kubernetes/observability/grafana/instance-with-prom-tempo-ds/{04-grafana-datasources.yaml => datasources.yaml} (92%) rename kubernetes/observability/grafana/instance-with-prom-tempo-ds/{02-grafana-instance.yaml => grafana-instance.yaml} (100%) rename kubernetes/observability/grafana/instance-with-prom-tempo-ds/{02-grafana-serviceaccount.yaml => grafana-sa.yaml} (100%) create mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml create 
mode 100644 kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml rename kubernetes/observability/grafana/instance-with-prom-tempo-ds/{03-grafana-route.yaml => route.yaml} (100%) rename kubernetes/observability/grafana/instance-with-prom-tempo-ds/{02-grafana-sa-token-secret.yaml => sa-token-secret.yaml} (100%) create mode 100644 kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md index 5f51f60e9..fbead1672 100644 --- a/kubernetes/observability/README.md +++ b/kubernetes/observability/README.md @@ -117,9 +117,9 @@ The prometheus datasource is the user-workload-monitoring prometheus running in The Grafana console is configured with `username: rhel, password: rhel` ```bash -cd grafana -./deploy-grafana.sh +oc apply -k ./grafana/instance-with-prom-tempo-ds ``` + Upon success, you can explore metrics and traces from Grafana route. #### GrafanaDashboard to visualize cluster metrics and traces diff --git a/kubernetes/observability/grafana/deploy-grafana.sh b/kubernetes/observability/grafana/deploy-grafana.sh deleted file mode 100755 index d383dcb34..000000000 --- a/kubernetes/observability/grafana/deploy-grafana.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -# This will fail if the GrafanaOperator is not installed -# TODO: replace this with kustomize script & add the clusterrole & rolebinding yamls - -MONITORING_NS=observability-hub -SECRET=grafana-sa-token - -oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml -n $MONITORING_NS -oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml -n $MONITORING_NS -oc apply -f $(pwd)/instance-with-prom-tempo-ds/02-grafana-instance.yaml -n $MONITORING_NS -oc apply -f $(pwd)/instance-with-prom-tempo-ds/03-grafana-route.yaml -n $MONITORING_NS -oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana-sa -oc adm policy add-cluster-role-to-user 
openshift-cluster-monitoring-view -z grafana-sa -oc adm policy add-cluster-role-to-user tempostack-traces-reader -z grafana-sa -oc adm policy add-role-to-user edit -z grafana-sa -n $MONITORING_NS -oc apply -f instance-with-prom-tempo-ds/04-grafana-datasources.yaml -n $MONITORING_NS - diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml similarity index 92% rename from kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml rename to kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml index 04832f87e..1ae6defb2 100644 --- a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/04-grafana-datasources.yaml +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml @@ -40,7 +40,7 @@ spec: editable: true type: tempo # This is specific to "observability-hub" namespace. If running tempostack elsewhere, need to update - url: "https://tempo-tempostack-gateway-observability-hub.apps.ocp-beta-test.nerc.mghpcc.org/api/traces/v1/dev/tempo" + url: "https://tempo-tempostack-gateway.observability-hub.svc.cluster.local:8081/api/traces/v1/dev/tempo" isDefault: false secureJsonData: "httpHeaderValue1": "Bearer ${token}" diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-instance.yaml similarity index 100% rename from kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-instance.yaml rename to kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-instance.yaml diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-sa.yaml similarity index 100% rename from 
kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-serviceaccount.yaml rename to kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-sa.yaml diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml new file mode 100644 index 000000000..fd291ae32 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- grafana-instance.yaml +- role.yaml +- sa-token-secret.yaml +- grafana-sa.yaml +- route.yaml +- datasources.yaml + diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml new file mode 100644 index 000000000..efcb9dc8e --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml @@ -0,0 +1,58 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-monitoring-view-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-monitoring-view +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: openshift-cluster-monitoring-view-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: openshift-cluster-monitoring-view +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces-reader-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + 
name: tempostack-traces-reader +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: edit + # update if not using observability-hub namespace + namespace: observability-hub +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: edit +subjects: +- kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/route.yaml similarity index 100% rename from kubernetes/observability/grafana/instance-with-prom-tempo-ds/03-grafana-route.yaml rename to kubernetes/observability/grafana/instance-with-prom-tempo-ds/route.yaml diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/sa-token-secret.yaml similarity index 100% rename from kubernetes/observability/grafana/instance-with-prom-tempo-ds/02-grafana-sa-token-secret.yaml rename to kubernetes/observability/grafana/instance-with-prom-tempo-ds/sa-token-secret.yaml diff --git a/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml b/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml new file mode 100644 index 000000000..3426c24d9 --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml @@ -0,0 +1,55 @@ +# Once this exists, any pod with the template.metadata.annotation below will send metrics +# to observability-hub: +# sidecar.opentelemetry.io/inject: llamastack-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: llamastack-otelsidecar +spec: + observability: + metrics: {} + 
deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars can export to the central observability-hub otel-collector, then be + # exported to various backends from there (in-cluster, external 3rd party) + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + otlp: + protocols: + grpc: {} + http: {} + service: + pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp + telemetry: + metrics: + address: '0.0.0.0:8888' + mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: SingleStack diff --git a/kubernetes/observability/run-configuration.md b/kubernetes/observability/run-configuration.md index 662fdda7d..6ae6d92c0 100644 --- a/kubernetes/observability/run-configuration.md +++ b/kubernetes/observability/run-configuration.md @@ -10,12 +10,13 @@ you can `curl localhost:8000/metrics` from within a vLLM container. #### traces It's possible to generate vLLM distributed trace data by updating the vLLM image and start command. This [Containerfile](./vllm-Containerfile) -shows the necessary packages to generate vLLM traces. +shows the necessary packages to generate vLLM traces. In the future, these packages may be added to the default vLLM image available from +Red Hat OpenShift AI. Here is how you would build vLLM with the tracing packages: ```bash -podman build --platform x86_64 -t quay.io/[your-quay-username]/vllm:otlp-tracing -f vllm-Containerfile . +podman build --platform x86_64 -t quay.io/<your-quay-username>/vllm:otlp-tracing -f vllm-Containerfile . 
podman push quay.io/[your-quay-username]/vllm:otlp-tracing ``` @@ -49,7 +50,7 @@ See [OpenTelemetryCollector Sidecars Deployment](./README.md#opentelemetrycollec - "all" - --port - "8000" - image: 'quay.io/sallyom/vllm:otlp-tracing' + image: 'quay.io/<your-quay-username>/vllm:otlp-tracing' env: - name: OTEL_SERVICE_NAME value: "vllm-granite8b" @@ -66,7 +67,9 @@ avoid the performance impact. A complete list of vLLM engine arguments can be fo ### Llamastack With Llamastack, you need to specify in the run-config.yaml to enable telemetry collection with an opentelemetry receiver. -Here's how to do that: +Don't update these until _after_ the [OpentelemetryCollector Sidecar](./otel-collector/otel-collector-llamastack-sidecar.yaml) +is deployed. Follow the [observability-hub guide](./README.md) +to install the `RH Build of OpenTelemetry Operator` and `OpenTelemetryCollector`. #### Updated manifests for telemetry trace collection with opentelemetry receiver endpoint @@ -89,19 +92,31 @@ only 4 metrics generated within Llamastack, and these are duplicates of what vLL And, in [kubernetes/llama-stack/deployment.yaml](../llama-stack/deployment.yaml) ```yaml +--- + template: + metadata: + labels: + app: llama-stack + annotations: + sidecar.opentelemetry.io/inject: llamastack-otelsidecar + spec: + containers: --- env: - name: OTEL_SERVICE_NAME value: llamastack - name: OTEL_TRACE_ENDPOINT - value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces + value: http://localhost:4318/v1/traces #- name: OTEL_METRIC_ENDPOINT - #- value: http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/metrics + #  value: http://localhost:4318/v1/metrics --- ``` The otel-endpoint is `http://service-name-otc.namespace-of-otc.svc.cluster.local:4318/v1/traces,metrics` if exporting to central otel-collector. If using otel-collector sidecar, this would be `http://localhost:4318/v1/traces,metrics`. 
-Now that vLLM and Llamastack are configured to generate and export telemetry, follow the [observability-hub guide](./README.md) to view and analyze -the data. +Don't update the Llamastack deployment until _after_ the [OpentelemetryCollector Sidecar](./otel-collector/otel-collector-llamastack-sidecar.yaml) +is deployed. + +Now that the configuration changes necessary to generate and export telemetry from vLLM and Llamastack have been made, +follow the [observability-hub guide](./README.md) to view and analyze the data. diff --git a/kubernetes/observability/tempo/minio-secret-tempo.yaml b/kubernetes/observability/tempo/minio-secret-tempo.yaml index 9d66e66db..295a98518 100644 --- a/kubernetes/observability/tempo/minio-secret-tempo.yaml +++ b/kubernetes/observability/tempo/minio-secret-tempo.yaml @@ -7,5 +7,5 @@ stringData: access_key_id: tempo # notsecret access_key_secret: supersecret # notsecret bucket: tempo # notsecret - endpoint: http://minio-tempo.observability.svc:9000 # notsecret + endpoint: http://minio-tempo.observability-hub.svc.cluster.local:9000 # notsecret type: Opaque From 4cca9b03a41e265e2ee1de00b4fa38753b1b3451 Mon Sep 17 00:00:00 2001 From: sallyom Date: Thu, 10 Apr 2025 13:27:23 -0400 Subject: [PATCH 5/5] add otel sidecar to llamastack deployment Signed-off-by: sallyom --- kubernetes/llama-stack/deployment.yaml | 6 +- .../llama-stack/otel-collector-sidecar.yaml | 58 +++++++++++++++++++ kubernetes/observability/README.md | 35 +++++++---- 3 files changed, 85 insertions(+), 14 deletions(-) create mode 100644 kubernetes/llama-stack/otel-collector-sidecar.yaml diff --git a/kubernetes/llama-stack/deployment.yaml b/kubernetes/llama-stack/deployment.yaml index 2d2ae0f69..2691390db 100644 --- a/kubernetes/llama-stack/deployment.yaml +++ b/kubernetes/llama-stack/deployment.yaml @@ -8,8 +8,8 @@ spec: app: llamastack template: metadata: - #annotations: - # sidecar.opentelemetry.io/inject: otelsidecar + annotations: + sidecar.opentelemetry.io/inject: 
llamastack-otelsidecar 
labels: app: llamastack spec: @@ -47,7 +47,7 @@ spec: - name: OTEL_SERVICE_NAME value: om-llamastack - name: OTEL_TRACE_ENDPOINT - value: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces' + value: 'http://localhost:4318/v1/traces' - name: SAFETY_MODEL value: meta-llama/Llama-Guard-3-8B - name: SAFETY_VLLM_URL diff --git a/kubernetes/llama-stack/otel-collector-sidecar.yaml b/kubernetes/llama-stack/otel-collector-sidecar.yaml new file mode 100644 index 000000000..43230c43c --- /dev/null +++ b/kubernetes/llama-stack/otel-collector-sidecar.yaml @@ -0,0 +1,58 @@ +# Once this exists, any pod with the template.metadata.annotation below will send metrics +# to observability-hub: +# sidecar.opentelemetry.io/inject: llamastack-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: llamastack-otelsidecar +spec: + observability: + metrics: {} + deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars export to the central observability-hub otel-collector, then be + # exported to various backends from there (in-cluster, external 3rd party) + # this is deployed with ../observability/otel-collector manifests + # see ../observability/README.md for how to deploy this collector + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + otlp: + protocols: + grpc: {} + http: {} + service: + pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp + telemetry: + metrics: + address: '0.0.0.0:8888' + mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: 
SingleStack + diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md index fbead1672..6bd395f87 100644 --- a/kubernetes/observability/README.md +++ b/kubernetes/observability/README.md @@ -65,6 +65,8 @@ However, if exporting off-cluster to a 3rd party observability vendor, the colle and can provide a single place with which to receive telemetry from various workloads and export as a single authenticated and secure OTLP stream. +#### Central OpenTelemetry Collector + To create a central opentelemetry-collector, update the [otel-collector/otel-collector.yaml](./otel-collector/otel-collector.yaml) to match your requirements and then apply. @@ -72,27 +74,38 @@ To create a central opentelemetry-collector, update the oc apply --kustomize ./otel-collector -n observability-hub ``` -### OpenTelemetryCollector Sidecars deployment +#### OpenTelemetryCollector Sidecars deployment You can add individual metrics endpoints to the central otel-collector in observability-hub, but another way is to add otel-collector sidecar containers to individual deployments throughout the cluster. Paired with an annotation on the deployment, telemetry will be exported as configured. -Any deployment with the annotation below will receive and export telemetry as configured in the + +Any deployment with the template.metadata.annotations `sidecar.opentelemetry.io/inject: vllm-otelsidecar` +will receive and export telemetry as configured in the [otel-collector-vllm-sidecar.yaml](./otel-collector/otel-collector-vllm-sidecar.yaml). 
-The example here will add an otel-collector sidecar custom resource to the `llama-serve` namespace, -and to trigger a sidecar container, annotate any deployment's `template.metadata.annotations` with: -`sidecar.opentelemetry.io/inject: vllm-otelsidecar` +Any deployment with the template.metadata.annotations `sidecar.opentelemetry.io/inject: llamastack-otelsidecar` +will receive and export telemetry as configured in the +[otel-collector-llamastack-sidecar.yaml](./otel-collector/otel-collector-llamastack-sidecar.yaml). + +The example below will add otel-collector sidecar custom resources to the `llama-serve` namespace, +and once the deployments with the added annotations are scaled down and back up, sidecar otel-collector +containers will be added to the pods. ```bash -oc apply -f ./otel-collector/otel-collector-vllm-sidecar.yaml +oc apply -f ./otel-collector/otel-collector-vllm-sidecar.yaml -n llama-serve +oc apply -f ./otel-collector/otel-collector-llamastack-sidecar.yaml -n llama-serve + +# Then, annotate whatever deployment you'd like to collect telemetry from +# Add the annotation to the deployment's `template.metadata.annotations` from the console. +# OR +# Patch or modify the llamastack and vLLM deployments with the appropriate annotation. +# Replace `deployment-name`, `namespace`, and `name-of-otelsidecar` in the below command. -# Then, annotate whatever vllm deployment you'd like to collect metrics from -# Or, add the annotation to the deployment's `template.metadata.annotations` from the console. -oc patch deployment \ - -n \ +oc patch deployment deployment-name \ + -n namespace \ --type='merge' \ - -p '{"spec":{"template":{"metadata":{"annotations":{"sidecar.opentelemetry.io/inject":"vllm-otelsidecar"}}}}}' + -p '{"spec":{"template":{"metadata":{"annotations":{"sidecar.opentelemetry.io/inject":"name-of-otelsidecar"}}}}}' ``` ### Cluster Observability Operator Tracing UIPlugin