diff --git a/kubernetes/llama-stack/deployment.yaml b/kubernetes/llama-stack/deployment.yaml index 2d2ae0f69..2691390db 100644 --- a/kubernetes/llama-stack/deployment.yaml +++ b/kubernetes/llama-stack/deployment.yaml @@ -8,8 +8,8 @@ spec: app: llamastack template: metadata: - #annotations: - # sidecar.opentelemetry.io/inject: otelsidecar + annotations: + sidecar.opentelemetry.io/inject: llamastack-otelsidecar labels: app: llamastack spec: @@ -47,7 +47,7 @@ spec: - name: OTEL_SERVICE_NAME value: om-llamastack - name: OTEL_TRACE_ENDPOINT - value: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318/v1/traces' + value: 'http://localhost:4318/v1/traces' - name: SAFETY_MODEL value: meta-llama/Llama-Guard-3-8B - name: SAFETY_VLLM_URL diff --git a/kubernetes/llama-stack/otel-collector-sidecar.yaml b/kubernetes/llama-stack/otel-collector-sidecar.yaml new file mode 100644 index 000000000..43230c43c --- /dev/null +++ b/kubernetes/llama-stack/otel-collector-sidecar.yaml @@ -0,0 +1,58 @@ +# Once this exists, any pod with the template.metadata.annotation below will send traces +# to observability-hub: +# sidecar.opentelemetry.io/inject: llamastack-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: llamastack-otelsidecar +spec: + observability: + metrics: {} + deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars export to the central observability-hub otel-collector; telemetry is then + # exported to various backends from there (in-cluster, external 3rd party) + # this is deployed with ../observability/otel-collector manifests + # see ../observability/README.md for how to deploy this collector + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + otlp: + protocols: + grpc: {} + http: {} + service: + pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp + telemetry: + 
metrics: + address: '0.0.0.0:8888' + mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: SingleStack + diff --git a/kubernetes/observability/README.md b/kubernetes/observability/README.md new file mode 100644 index 000000000..6bd395f87 --- /dev/null +++ b/kubernetes/observability/README.md @@ -0,0 +1,148 @@ +# Monitor Llamastack & vLLM in OpenShift + +Follow this README to configure an observability stack in OpenShift to visualize Llamastack telemetry and vLLM metrics. +First, ensure Llamastack and vLLM are configured to generate telemetry by following this [configuration guide](./run-configuration.md). + + +## OpenShift Observability Operators + +Operators are available from OperatorHub. +The following operators must be installed in order to proceed with this example. + +### Operator descriptions + +1. **Red Hat Build of OpenTelemetry**: The OpenTelemetry Collector (OTC) is provided from this operator. +Metrics and traces will be distributed from the OTC to various backends. Tempo is deployed and is the tracing backend. + +2. **Tempo Operator**: Provides `TempoStack` Custom Resource. This is the backend for distributed tracing. +An S3-compatible storage (Minio) is paired with Tempo. + +3. **Cluster Observability Operator**: This provides PodMonitor and ServiceMonitor Custom Resources which are necessary for +user-workload monitoring's prometheus to scrape workload metrics. Also, the COO provides UIPlugins for viewing telemetry. + +4. **(optional) Grafana Operator**: Provides Grafana APIs including `GrafanaDashboard`, `Grafana`, and `GrafanaDataSource` that will be used to visualize telemetry. 
+ +## Create PodMonitor or ServiceMonitor for any AI Workload that exposes a metrics endpoint + +This is how to enable collection of user-workload metrics for any workload within OpenShift. You need to create a `PodMonitor` or a `ServiceMonitor`. +The PodMonitor will ensure all metrics from pods with matching selectors will be scraped by the user-workload-monitoring Prometheus, and a ServiceMonitor will +scrape from any pod that runs under a particular service. + +* [Example PodMonitor](./podmonitor-example-0.yaml) +* [Example ServiceMonitor](./servicemonitor-example.yaml) + +Upon creation of either, metrics will be scraped and will be visible from the console `Observe -> Metrics` dashboards. + +## Create custom resources and configurations for a central observability hub + +Create the observability hub namespace `observability-hub`. If a different namespace is created, be sure to update the resource yamls accordingly. + +```bash +oc create ns observability-hub +``` + +### Tracing Backend (Tempo with Minio for S3 storage) + +In order to view distributed tracing data from Llamastack and/or vLLM, you must deploy a tracing backend. The supported tracing backend in OpenShift +is Tempo. See the OpenShift Tempo +[documentation](https://docs.redhat.com/en/documentation/openshift_container_platform/4.18/html/distributed_tracing/distributed-tracing-platform-tempo#distr-tracing-tempo-install-tempostack-web-console_dist-tracing-tempo-installing) +for further details. Tempo must be paired with a storage solution. For this example, `MinIO` is used. The necessary resources can be created by +applying the `./tempo` manifests. + +```bash +# edit storageclassName & secret as necessary +# secret and storage for testing only +oc apply --kustomize ./tempo -n observability-hub +``` + +### OpenTelemetryCollector deployment + +OpenTelemetry Collector is used to aggregate telemetry from various workloads, process individual signals, and export +to various backends. 
This example will collect traces from various workloads and export all as a single +authenticated stream to the in-cluster TempoStack. For in-cluster only, opentelemetry-collector is not necessary to collect +metrics. Metrics are sent to the in-cluster user-workload-monitoring prometheus by creating the podmonitors and servicemonitors. +However, if exporting off-cluster to a 3rd party observability vendor, the collector is necessary for all signals, +and can provide a single place with which to receive telemetry from various workloads and export as a single authenticated and +secure OTLP stream. + +#### Central OpenTelemetry Collector + +To create a central opentelemetry-collector, update the +[otel-collector/otel-collector.yaml](./otel-collector/otel-collector.yaml) to match your requirements and then apply. + +```bash +oc apply --kustomize ./otel-collector -n observability-hub +``` + +#### OpenTelemetryCollector Sidecars deployment + +You can add individual metrics endpoints to the central otel-collector in observability-hub, but +another way is to add otel-collector sidecar containers to individual deployments throughout the +cluster. Paired with an annotation on the deployment, telemetry will be exported as configured. + +Any deployment with the template.metadata.annotations `sidecar.opentelemetry.io/inject: vllm-otelsidecar` +will receive and export telemetry as configured in the +[otel-collector-vllm-sidecar.yaml](./otel-collector/otel-collector-vllm-sidecar.yaml). + +Any deployment with the template.metadata.annotations `sidecar.opentelemetry.io/inject: llamastack-otelsidecar` +will receive and export telemetry as configured in the +[otel-collector-llamastack-sidecar.yaml](./otel-collector/otel-collector-llamastack-sidecar.yaml). 
+ +The example below will add otel-collector sidecar custom resources to the `llama-serve` namespace, +and upon a scale down, scale up of the deployments with the added annotations, sidecar otel-collector +containers will be added to the pods. + +```bash +oc apply -f ./otel-collector/otel-collector-vllm-sidecar.yaml -n llama-serve +oc apply -f ./otel-collector/otel-collector-llamastack-sidecar.yaml -n llama-serve + +# Then, annotate whatever deployment you'd like to collect telemetry from +# Add the annotation to the deployment's `template.metadata.annotations` from the console. +# OR +# Patch or modify the llamastack and vLLM deployments with the appropriate annotation. +# Replace `deployment-name`, `namespace`, and `name-of-otelsidecar` in the below command. + +oc patch deployment deployment-name \ + -n namespace \ + --type='merge' \ + -p '{"spec":{"template":{"metadata":{"annotations":{"sidecar.opentelemetry.io/inject":"name-of-otelsidecar"}}}}}' +``` + +### Cluster Observability Operator Tracing UIPlugin + +The Jaeger frontend feature of TempoStack is no longer supported by Red Hat. This has been replaced by the COO UIPlugin. To create the UIPlugin for +Tracing, first ensure the TempoStack described above is created. This is a prerequisite. Then, all that's necessary to view traces from +the OpenShift console at `Observe -> Traces` is to create the following [Tracing UIPlugin resource](./tracing-ui-plugin.yaml). + +```bash +oc apply -f ./tracing-ui-plugin.yaml +``` + +You should now see traces and metrics in the OpenShift console, from the `Observe` tab. + +### Grafana + +Most users are familiar with Grafana for visualizing and analyzing telemetry. To create the Grafana resources necessary to view +Llamastack and vLLM telemetry, follow the below example. + +This example will deploy a Grafana instance, and Prometheus & Tempo DataSources. +The prometheus datasource is the user-workload-monitoring prometheus running in `openshift-user-workload-monitoring` namespace. 
+The Grafana console is configured with `username: rhel, password: rhel` + +```bash +oc apply -k ./grafana/instance-with-prom-tempo-ds +``` + +Upon success, you can explore metrics and traces from the Grafana route. + +#### GrafanaDashboard to visualize cluster metrics and traces + +Check out [github.com/kevchu3/openshift4-grafana](https://github.com/kevchu3/openshift4-grafana/tree/master/dashboards/crds) for a list of +dashboards to deploy on OpenShift. + +Here's an example to download and deploy a GrafanaDashboard for OpenShift 4.16 cluster metrics. +The dashboard is slightly modified from https://github.com/kevchu3/openshift4-grafana/blob/master/dashboards/json_raw/cluster_metrics.ocp416.json + +```bash +oc apply -n observability-hub -f ./grafana/cluster-metrics-dashboard/cluster-metrics.yaml +``` diff --git a/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml new file mode 100644 index 000000000..40c2da8df --- /dev/null +++ b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster-metrics.yaml @@ -0,0 +1,13 @@ +kind: GrafanaDashboard +apiVersion: grafana.integreatly.org/v1beta1 +metadata: + name: cluster-metrics + labels: + app: grafana +spec: + instanceSelector: + matchLabels: + dashboards: grafana # This label matches the grafana Grafana instance + # This json was copied and modified from https://github.com/kevchu3/openshift4-grafana/blob/master/dashboards/json_raw/cluster_metrics.ocp416.json + url: https://raw.githubusercontent.com/redhat-et/edge-ocp-observability/refs/heads/main/observability-hub/grafana/cluster-metrics-dashboard/cluster_metrics_ocp.json + diff --git a/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json new file mode 100644 index 000000000..8f150d607 --- /dev/null +++ 
b/kubernetes/observability/grafana/cluster-metrics-dashboard/cluster_metrics.ocp.json @@ -0,0 +1,2178 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 5, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": "prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 53, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Health", + "type": "row" + }, + { + "dashboardFilter": "", + "dashboardTags": [], + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 51, + "limit": 10, + "nameFilter": "", + "onlyAlertsOnDashboard": true, + "options": { + "alertInstanceLabelFilter": "", + "alertName": "", + "dashboardAlerts": false, + "groupBy": [], + "groupMode": "default", + "maxItems": 20, + "sortOrder": 1, + "stateFilter": { + "error": true, + "firing": true, + "noData": false, + "normal": false, + "pending": true + }, + "viewMode": "list" + }, + "show": "current", + "sortOrder": 1, + "stateFilter": [ + "alerting", + "paused", + "no_data", + "execution_error", + "pending" + ], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 6, + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Alerts Dashboard", + "type": "alertlist" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + 
"match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 1 + }, + "id": 32, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(cluster:node_instance_type_count:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Compute Nodes", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Excludes control plane", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 1 + }, + "id": 34, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": 
"sum(cluster:capacity_cpu_cores:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Allocatable Cores", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Req", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "code:cluster:ingress_http_request_count:rate5m:sum", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "HTTP {{code}}", + "refId": "A", + "step": 120 + } + ], + "title": "Cluster Http Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Cluster operators in Progress, Degraded, Failing, etc...", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Conditions", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": -0.05, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 49, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "(cluster_operator_conditions{condition!~\"Available|Upgradeable|RetrievedUpdates\"} == 1)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{name}} ({{condition}})", + "refId": "A" + } + ], + "title": "Cluster Operators by Failed Condition", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": 
"Excludes infrastructure namespaces", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 3 + }, + "id": 38, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "count(kube_namespace_status_phase{phase='Active',namespace!~\"(default|kube|openshift).*\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Application Projects", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Excludes control plane", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 3 + }, + "id": 36, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": 
true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(cluster:capacity_memory_bytes:sum{label_node_role_kubernetes_io!=\"master\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Allocatable Memory", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 6, + "y": 5 + }, + "id": 40, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_pod_status_ready{condition=\"true\",namespace!~\"(default|kube|openshift).*\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Application Pods", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Persistent volume claims for applications", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "text", + 
"value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 9, + "y": 5 + }, + "id": 42, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(kube_persistentvolumeclaim_resource_requests_storage_bytes {namespace!~\"(default|kube|openshift).*\"})/1e+9", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Allocated App Storage", + "type": "stat" + }, + { + "collapsed": false, + "datasource": "prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 30, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Containers in cluster by state, excluding Running and Completed pod status", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 8 + }, + "id": 72, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (reason) (kube_pod_container_status_waiting_reason)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (reason) (kube_pod_container_status_terminated_reason{reason=~\"ContainerCannotRun|Error|OOMKilled\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "B" + } + ], + "title": "Waiting and Terminated Containers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 8 + }, + "id": 105, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket[5m])) by (operation_type, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "runtime: {{operation_type}}", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket[5m])) by (operation_type, le))", + "legendFormat": "cgroup: {{operation_type}}", + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket[5m])) by (operation_type, le))", + "legendFormat": "worker: {{operation_type}}", + "refId": "C" + } + ], + "title": "Kubelet Operations Latency", + "type": 
"timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "µs" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 8 + }, + "id": 66, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "avg by (operation_type) (rate(container_runtime_crio_operations_latency_seconds_total[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{operation_type}}", + "refId": "A" + } + ], + "title": "Container Runtime Operations Latency", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + 
"axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 0, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.5 + } + ] + }, + "unit": "short" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 14 + }, + "id": 82, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (alertname, severity)(ALERTS{alertname!=\"Watchdog\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{alertname}} ({{severity}})", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (severity)(ALERTS{severity!~\"none|info|warning\"})", + "hide": true, + "legendFormat": "Critical alerts", + "refId": "B" + } + ], + "title": "Alerts by State", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + "y": 14 + }, + "id": 100, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "code:apiserver_request_total:rate:sum", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "HTTP {{code}}", + "refId": "A" + } + ], + "title": "APIServer Requests by Status", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": 
false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 14 + }, + "id": 101, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (verb) (rate(apiserver_request_total[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{verb}}", + "refId": "A" + } + ], + "title": "APIServer Requests by Verb", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 0, + "links": [], 
+ "mappings": [], + "max": 1.05, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "transparent", + "value": 1 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 14 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "etcd_server_has_leader", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Etcd Server has Leader", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": "prometheus", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 20 + }, + "id": 103, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Cluster Capacity", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Kubernetes schedules based on CPU and memory pod requests", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 1, + 
"links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 0.8 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [ + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsZero", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + }, + { + "matcher": { + "id": "byValue", + "options": { + "op": "gte", + "reducer": "allIsNull", + "value": 0 + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": true, + "tooltip": true, + "viz": false + } + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 0, + "y": 21 + }, + "id": 78, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_request{resource='cpu',namespace!~\"(default|kube|openshift).*\"}) and count (kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_cpu_cores)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "cpu.requests", + "refId": "A", + "step": 120 + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_request{resource='memory',namespace!~\"(default|kube|openshift).*\"}) and count(kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_memory_bytes)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "memory.requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": 
"prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_limit{resource='cpu',namespace!~\"(default|kube|openshift).*\"}) and count (kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_cpu_cores)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "cpu.limits", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(sum by (pod) (kube_pod_resource_limit{resource='memory',namespace!~\"(default|kube|openshift).*\"}) and count(kube_pod_status_phase{phase=~\"Running|Pending|Unknown\"} == 1) by (pod)) / sum (machine_memory_bytes)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "memory.limits", + "range": true, + "refId": "D" + } + ], + "title": "Cluster Pod Requests and Limits", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 200 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 6, + 
"y": 21 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "kubelet_running_pods", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{node}}", + "refId": "A", + "step": 120 + } + ], + "title": "Pods per Node", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 12, + "y": 21 + }, + "id": 91, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum by (namespace) 
(node_namespace_pod:kube_pod_info:{namespace!~\"(openshift).*\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{namespace}}", + "refId": "A", + "step": 120 + } + ], + "title": "Pods per App Namespace", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line+area" + } + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "transparent", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 6, + "x": 18, + "y": 21 + }, + "id": 55, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "pluginVersion": "10.4.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "100 * sort_desc(sum (kubelet_volume_stats_used_bytes) by (persistentvolumeclaim, namespace) / sum (kubelet_volume_stats_capacity_bytes) by (persistentvolumeclaim, namespace))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{namespace}} / {{persistentvolumeclaim}}", + 
"refId": "A" + } + ], + "title": "Persistent Volume Claim Used %", + "type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "name": "datasource", + "type": "datasource", + "query": "prometheus", + "current": { + "selected": true, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": "Datasource", + "regex": "", + "refresh": 1, + "sort": 0 + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Cluster Metrics", + "uid": "dxkdT-eWz", + "version": 2, + "weekStart": "" +} diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml new file mode 100644 index 000000000..1ae6defb2 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/datasources.yaml @@ -0,0 +1,56 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: prometheus +spec: + instanceSelector: + matchLabels: + dashboards: grafana # This label matches the grafana Grafana instance + datasource: + name: prometheus + access: proxy + editable: true + type: prometheus + url: "https://thanos-querier.openshift-monitoring.svc.cluster.local:9091" + isDefault: true + secureJsonData: + "httpHeaderValue1": "Bearer ${token}" + jsonData: + "httpHeaderName1": "Authorization" + "timeInterval": "5s" + "tlsSkipVerify": true + valuesFrom: + - targetPath: "secureJsonData.httpHeaderValue1" + valueFrom: + secretKeyRef: + name: "grafana-sa-token" + key: "token" +--- +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDatasource +metadata: + name: tempo +spec: + instanceSelector: + matchLabels: + 
dashboards: grafana # This label matches the grafana Grafana instance + datasource: + name: tempo + access: proxy + editable: true + type: tempo + # This is specific to "observability-hub" namespace. If running tempostack elsewhere, need to update + url: "https://tempo-tempostack-gateway.observability-hub.svc.cluster.local:8081/api/traces/v1/dev/tempo" + isDefault: false + secureJsonData: + "httpHeaderValue1": "Bearer ${token}" + jsonData: + "httpHeaderName1": "Authorization" + "timeInterval": "5s" + "tlsSkipVerify": true + valuesFrom: + - targetPath: "secureJsonData.httpHeaderValue1" + valueFrom: + secretKeyRef: + name: "grafana-sa-token" + key: "token" diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-instance.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-instance.yaml new file mode 100644 index 000000000..6adb45444 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-instance.yaml @@ -0,0 +1,20 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: Grafana +metadata: + name: grafana + labels: + dashboards: grafana +spec: + config: + log: + level: warn + mode: console + security: + admin_password: "rhel" + admin_user: "rhel" + dashboardLabelSelector: + - matchExpressions: + - key: app + operator: In + values: + - grafana diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-sa.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-sa.yaml new file mode 100644 index 000000000..1bd9aefbc --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/grafana-sa.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana-sa diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml new file mode 100644 index 000000000..fd291ae32 --- /dev/null +++ 
b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- grafana-instance.yaml +- role.yaml +- sa-token-secret.yaml +- grafana-sa.yaml +- route.yaml +- datasources.yaml + diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml new file mode 100644 index 000000000..efcb9dc8e --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/role.yaml @@ -0,0 +1,58 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: cluster-monitoring-view-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: cluster-monitoring-view +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: openshift-cluster-monitoring-view-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: openshift-cluster-monitoring-view +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces-reader-grafana +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tempostack-traces-reader +subjects: + - kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: edit + # update if not using observability-hub namespace + namespace: observability-hub +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: 
edit +subjects: +- kind: ServiceAccount + name: grafana-sa + # update if not using observability-hub namespace + namespace: observability-hub diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/route.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/route.yaml new file mode 100644 index 000000000..df8a1f7e3 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/route.yaml @@ -0,0 +1,14 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: grafana-route +spec: + to: + kind: Service + name: grafana-service + weight: 100 + port: + targetPort: grafana + tls: + termination: edge + wildcardPolicy: None diff --git a/kubernetes/observability/grafana/instance-with-prom-tempo-ds/sa-token-secret.yaml b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/sa-token-secret.yaml new file mode 100644 index 000000000..b3073e727 --- /dev/null +++ b/kubernetes/observability/grafana/instance-with-prom-tempo-ds/sa-token-secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-sa-token + annotations: + kubernetes.io/service-account.name: grafana-sa +type: kubernetes.io/service-account-token diff --git a/kubernetes/observability/minio-user-cred.yaml b/kubernetes/observability/minio-user-cred.yaml new file mode 100644 index 000000000..d295a7c8f --- /dev/null +++ b/kubernetes/observability/minio-user-cred.yaml @@ -0,0 +1,10 @@ +kind: Secret +apiVersion: v1 +metadata: + name: minio-user-creds +# TEST VALUES ONLY USED IN DEV TESTING +stringData: + MINIO_ROOT_USER: test + MINIO_ROOT_PASSWORD: supersecret +type: Opaque + diff --git a/kubernetes/observability/otel-collector/clusterrole.yaml b/kubernetes/observability/otel-collector/clusterrole.yaml new file mode 100644 index 000000000..7923059d2 --- /dev/null +++ b/kubernetes/observability/otel-collector/clusterrole.yaml @@ -0,0 +1,29 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: 
tempostack-traces-write +rules: + - apiGroups: + - 'tempo.grafana.com' + resources: + - dev + resourceNames: + - traces + verbs: + - 'create' +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tempostack-traces-write +subjects: + - kind: ServiceAccount + name: otel-collector + # update if not using observability-hub namespace + namespace: observability-hub + diff --git a/kubernetes/observability/otel-collector/kustomization.yaml b/kubernetes/observability/otel-collector/kustomization.yaml new file mode 100644 index 000000000..634d19bb3 --- /dev/null +++ b/kubernetes/observability/otel-collector/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- sa.yaml +- clusterrole.yaml +- otel-collector.yaml diff --git a/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml b/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml new file mode 100644 index 000000000..3426c24d9 --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector-llamastack-sidecar.yaml @@ -0,0 +1,55 @@ +# Once this exists, any pod with the template.metadata.annotation below will send traces +# to observability-hub: +# sidecar.opentelemetry.io/inject: llamastack-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: llamastack-otelsidecar +spec: + observability: + metrics: {} + deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars can export to the central observability-hub otel-collector, then be + # exported to various backends from there (in-cluster, external 3rd party) + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + otlp: + protocols: + 
grpc: {} + http: {} + service: + pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp + telemetry: + metrics: + address: '0.0.0.0:8888' + mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: SingleStack diff --git a/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml new file mode 100644 index 000000000..10750f632 --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector-vllm-sidecar.yaml @@ -0,0 +1,71 @@ +# Once this exists, any pod with the template.metadata.annotation below will send metrics +# to observability-hub: +# sidecar.opentelemetry.io/inject: vllm-otelsidecar +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: vllm-otelsidecar + namespace: llama-serve +spec: + observability: + metrics: {} + deploymentUpdateStrategy: {} + config: + exporters: + debug: {} + otlphttp: + # all sidecars can export to the central observability-hub otel-collector, then be + # exported to various backends from there (in-cluster, external 3rd party) + endpoint: 'http://otel-collector-collector.observability-hub.svc.cluster.local:4318' + tls: + insecure: true + processors: {} + receivers: + otlp: + protocols: + grpc: {} + http: {} + prometheus: + config: + scrape_configs: + - job_name: vllm-sidecar + scrape_interval: 15s + static_configs: + - targets: + - 'localhost:8000' + service: + pipelines: + traces: + exporters: + - debug + - otlphttp + receivers: + - otlp + metrics: + exporters: + - debug + - otlphttp + receivers: + - prometheus + - otlp + telemetry: + metrics: + address: '0.0.0.0:8888' + 
mode: sidecar + resources: {} + podDnsConfig: {} + managementState: managed + upgradeStrategy: automatic + ingress: + route: {} + daemonSetUpdateStrategy: {} + targetAllocator: + allocationStrategy: consistent-hashing + filterStrategy: relabel-config + observability: + metrics: {} + prometheusCR: + scrapeInterval: 30s + resources: {} + replicas: 1 + ipFamilyPolicy: SingleStack diff --git a/kubernetes/observability/otel-collector/otel-collector.yaml b/kubernetes/observability/otel-collector/otel-collector.yaml new file mode 100644 index 000000000..b91fa415b --- /dev/null +++ b/kubernetes/observability/otel-collector/otel-collector.yaml @@ -0,0 +1,122 @@ +--- +apiVersion: opentelemetry.io/v1beta1 +kind: OpenTelemetryCollector +metadata: + name: otel-collector +spec: + serviceAccount: otel-collector + config: + extensions: + bearertokenauth: + filename: "/var/run/secrets/kubernetes.io/serviceaccount/token" + + exporters: + debug: + verbosity: basic + #otlphttp/dynatrace: + # update endpoint and Api-Token before deploying + #endpoint: "https://XXXXXXX.live.dynatrace.com/api/v2/otlp" + #headers: + #Authorization: "Api-Token dxxxxxx.XXXXXXXXXXXXXXX" + # Export the dev tenant traces to a Tempo instance + otlphttp/dev: + endpoint: https://tempo-tempostack-gateway.observability-hub.svc.cluster.local:8080/api/traces/v1/dev + tls: + insecure: false + ca_file: "/var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt" + auth: + authenticator: bearertokenauth + headers: + X-Scope-OrgID: "dev" + + receivers: + prometheus: + config: + scrape_configs: + # service/vllm in ns/llama-serve + # add any service.ns.svc.cluster.local:port that includes a /metrics endpoint + # If you use otel-sidecars in each deployment, you do not need to list them here. 
+ - job_name: vllm-llama-serve + scrape_interval: 15s + static_configs: + - targets: + - 'vllm.llama-serve.svc.cluster.local:8000' + # service/safety in ns/llama-serve + - job_name: vllm-safety-serve + scrape_interval: 15s + static_configs: + - targets: + - 'safety.llama-serve.svc.cluster.local:8000' + otlp: + protocols: + grpc: {} + #endpoint: 0.0.0.0:4317 + #tls: + # cert_file: /certs/server.crt + # client_ca_file: /certs/ca.crt + # key_file: /certs/server.key + http: {} + #endpoint: 0.0.0.0:4318 + #tls: + # cert_file: /certs/server.crt + # client_ca_file: /certs/ca.crt + # key_file: /certs/server.key + + processors: + batch: + send_batch_size: 100 + timeout: 1s + # cumulativetodelta necessary to export to dynatrace + # Dynatrace only accepts delta metrics + # OCP user-workload-monitoring only accepts cumulative metrics + #cumulativetodelta: {} + memory_limiter: + check_interval: 5s + limit_percentage: 95 + spike_limit_percentage: 25 + + service: + extensions: + - bearertokenauth + pipelines: + metrics: + exporters: + - debug + #- prometheus/ocp-uwm + #- otlphttp/dynatrace + receivers: + - otlp + - prometheus + processors: + - memory_limiter # keep first so overload is refused before anything is batched + #- cumulativetodelta + - batch + traces: + exporters: + - debug + - otlphttp/dev + #- otlphttp/dynatrace + receivers: + - otlp + processors: + - memory_limiter # keep first so overload is refused before anything is batched + - batch + telemetry: + metrics: + address: 0.0.0.0:8888 + ingress: + route: + termination: passthrough + type: route + mode: deployment + observability: + metrics: + enableMetrics: true + upgradeStrategy: automatic + #volumeMounts: + #- mountPath: /certs + # name: mtls-certs + #volumes: + #- secret: + # secretName: mtls-certs + # name: mtls-certs diff --git a/kubernetes/observability/otel-collector/sa.yaml b/kubernetes/observability/otel-collector/sa.yaml new file mode 100644 index 000000000..d820a7ded --- /dev/null +++ b/kubernetes/observability/otel-collector/sa.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: otel-collector + diff 
--git a/kubernetes/observability/podmonitor-example-0.yaml b/kubernetes/observability/podmonitor-example-0.yaml new file mode 100644 index 000000000..eefc64d49 --- /dev/null +++ b/kubernetes/observability/podmonitor-example-0.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor +metadata: + name: vllm-llama-serve-monitor +spec: + namespaceSelector: {} + podMetricsEndpoints: + - bearerTokenSecret: + key: "" + interval: 30s + path: /metrics + selector: + matchExpressions: + - key: app + operator: In + values: + - safety + - llama32-3b + - granite-8b + - llama31-70b diff --git a/kubernetes/observability/podmonitor-example-1.yaml b/kubernetes/observability/podmonitor-example-1.yaml new file mode 100644 index 000000000..8e4228d6f --- /dev/null +++ b/kubernetes/observability/podmonitor-example-1.yaml @@ -0,0 +1,15 @@ +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor +metadata: + name: vllm-llama-serve-monitor + namespace: llama-serve +spec: + namespaceSelector: {} + podMetricsEndpoints: + - bearerTokenSecret: + key: '' + interval: 30s + path: /metrics + selector: + matchLabels: + app: vllm # Must match the pod labels diff --git a/kubernetes/observability/run-configuration.md b/kubernetes/observability/run-configuration.md new file mode 100644 index 000000000..6ae6d92c0 --- /dev/null +++ b/kubernetes/observability/run-configuration.md @@ -0,0 +1,122 @@ +## Generate telemetry from Llamastack and vLLM + +### vLLM + +#### metrics + +For vLLM, metrics are generated by default and are exposed at `vllm-endpoint:port/metrics`. For a list of metrics, +you can `curl localhost:8000/metrics` from within a vLLM container. + +#### traces + +It's possible to generate vLLM distributed trace data by updating the vLLM image and start command. This [Containerfile](./vllm-Containerfile) +shows the necessary packages to generate vLLM traces. In the future, these packages may be added to the default vLLM image available from +Red Hat OpenShift AI. 
+ +Here is how you would build vLLM with the tracing packages: + +```bash +podman build --platform x86_64 -t quay.io/[your-quay-username]/vllm:otlp-tracing -f vllm-Containerfile . +podman push quay.io/[your-quay-username]/vllm:otlp-tracing +``` + +Then, add the following updates to the vLLM deployment.yaml. We'll use the [granite-8b deployment](../llama-serve/granite-8b/vllm.yaml): +This example assumes there is an OpenTelemetryCollector with sidecar mode in the same namespace. +See [OpenTelemetryCollector Sidecars Deployment](./README.md#opentelemetrycollector_sidecars_deployment) + + +```yaml +--- + template: + metadata: + labels: + app: granite-8b + annotations: + sidecar.opentelemetry.io/inject: vllm-otelsidecar + spec: + containers: + - args: + - --model + - ibm-granite/granite-3.2-8b-instruct + - --max-model-len + - "128000" + - --enable-auto-tool-choice + - --chat-template + - /app/tool_chat_template_granite.jinja + - --tool-call-parser=granite + - --otlp-traces-endpoint + - 127.0.0.1:4317 + - --collect-detailed-traces + - "all" + - --port + - "8000" + image: 'quay.io/[your-quay-username]/vllm:otlp-tracing' + env: + - name: OTEL_SERVICE_NAME + value: "vllm-granite8b" + - name: OTEL_EXPORTER_OTLP_TRACES_INSECURE + value: "true" +--- +``` + +With the updated vLLM image and the updated deployment, distributed trace data will be generated and collected by the opentelemetry-collector +sidecar container and exported to the central observability-hub as outlined in the [README.md](./README.md) with a `TempoStack` as a tracing backend. +There is a performance impact with enabling tracing with vLLM, so it's recommended to update the deployment to enable tracing only when debugging to +avoid the performance impact. A complete list of vLLM engine arguments can be found [here](https://docs.vllm.ai/en/latest/serving/engine_args.html). + +### Llamastack + +With Llamastack, telemetry collection must be enabled in the run-config.yaml and pointed at an OpenTelemetry receiver endpoint. 
+Don't update these until _after_ the [OpentelemetryCollector Sidecar](./otel-collector/otel-collector-llamastack-sidecar.yaml) +is deployed. Follow the [observability-hub guide](./README.md) +to install the `RH Build of OpenTelemetry Operator` and `OpenTelemetryCollector`. + +#### Updated manifests for telemetry trace collection with opentelemetry receiver endpoint + +This is for traces only. There is a similar `otel_metric` sink and `otel_metric_endpoint`; however, there are currently +only 4 metrics generated within Llamastack, and these are duplicates of what vLLM provides. + +[kubernetes/llama-stack/configmap.yaml](../llama-stack/configmap.yaml) + +```yaml +--- + telemetry: + - provider_id: meta-reference + provider_type: inline::meta-reference + config: + service_name: ${env.OTEL_SERVICE_NAME:llama-stack} + sinks: ${env.TELEMETRY_SINKS:console, otel_trace, sqlite} # <- add otel_trace and/or otel_metric + otel_trace_endpoint: ${env.OTEL_TRACE_ENDPOINT:} # <- add ONLY if an opentelemetry receiver endpoint is available +--- +``` +And, in [kubernetes/llama-stack/deployment.yaml](../llama-stack/deployment.yaml) + +```yaml +--- + template: + metadata: + labels: + app: llamastack + annotations: + sidecar.opentelemetry.io/inject: llamastack-otelsidecar + spec: + containers: +--- + env: + - name: OTEL_SERVICE_NAME + value: llamastack + - name: OTEL_TRACE_ENDPOINT + value: http://localhost:4318/v1/traces + #- name: OTEL_METRIC_ENDPOINT + #  value: http://localhost:4318/v1/metrics +--- +``` + +The otel-endpoint is `http://service-name-otc.namespace-of-otc.svc.cluster.local:4318/v1/traces` (or `/v1/metrics` for metrics) if exporting to +the central otel-collector. If using the otel-collector sidecar, this would be `http://localhost:4318/v1/traces` (or `/v1/metrics`). + +Don't update the Llamastack deployment until _after_ the [OpentelemetryCollector Sidecar](./otel-collector/otel-collector-llamastack-sidecar.yaml) +is deployed.
+ +Now that the configuration changes necessary to generate and export telemetry from vLLM and Llamastack are in place, +follow the [observability-hub guide](./README.md) to view and analyze the data. diff --git a/kubernetes/observability/servicemonitor-example.yaml b/kubernetes/observability/servicemonitor-example.yaml new file mode 100644 index 000000000..1156857a2 --- /dev/null +++ b/kubernetes/observability/servicemonitor-example.yaml @@ -0,0 +1,13 @@ +apiVersion: monitoring.rhobs/v1 +kind: ServiceMonitor +metadata: + name: vllm-llama-serve + namespace: llama-serve +spec: + selector: + matchLabels: + app: vllm # Must match the Service labels + endpoints: + - port: "8000" # Must match the Service port name + path: /metrics # Path to your metrics endpoint + interval: 5s diff --git a/kubernetes/observability/tempo/kustomization.yaml b/kubernetes/observability/tempo/kustomization.yaml new file mode 100644 index 000000000..b72d8ca47 --- /dev/null +++ b/kubernetes/observability/tempo/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: observability-hub +resources: +- tempo-role.yaml +- minio-secret-tempo.yaml +- minio-tempo-pvc.yaml +- minio-tempo-svc.yaml +- minio-tempo.yaml +- tempo-multitenant.yaml diff --git a/kubernetes/observability/tempo/minio-secret-tempo.yaml b/kubernetes/observability/tempo/minio-secret-tempo.yaml new file mode 100644 index 000000000..295a98518 --- /dev/null +++ b/kubernetes/observability/tempo/minio-secret-tempo.yaml @@ -0,0 +1,11 @@ +kind: Secret +apiVersion: v1 +metadata: + name: minio-tempo +# TEST VALUES ONLY USED IN DEV TESTING +stringData: + access_key_id: tempo # notsecret + access_key_secret: supersecret # notsecret + bucket: tempo # notsecret + endpoint: http://minio-tempo.observability-hub.svc.cluster.local:9000 # notsecret +type: Opaque diff --git a/kubernetes/observability/tempo/minio-tempo-pvc.yaml b/kubernetes/observability/tempo/minio-tempo-pvc.yaml new file mode 100644
index 000000000..fa53f6e5e --- /dev/null +++ b/kubernetes/observability/tempo/minio-tempo-pvc.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + finalizers: + - kubernetes.io/pvc-protection + labels: + app.kubernetes.io/name: minio-tempo + name: minio-tempo +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 12Gi + volumeMode: Filesystem + diff --git a/kubernetes/observability/tempo/minio-tempo-svc.yaml b/kubernetes/observability/tempo/minio-tempo-svc.yaml new file mode 100644 index 000000000..c75e95b8c --- /dev/null +++ b/kubernetes/observability/tempo/minio-tempo-svc.yaml @@ -0,0 +1,13 @@ +kind: Service +apiVersion: v1 +metadata: + name: minio-tempo +spec: + ports: + - protocol: TCP + port: 9000 + targetPort: 9000 + internalTrafficPolicy: Cluster + type: ClusterIP + selector: + app.kubernetes.io/name: minio-tempo diff --git a/kubernetes/observability/tempo/minio-tempo.yaml b/kubernetes/observability/tempo/minio-tempo.yaml new file mode 100644 index 000000000..9f7449c1b --- /dev/null +++ b/kubernetes/observability/tempo/minio-tempo.yaml @@ -0,0 +1,54 @@ +kind: Deployment +apiVersion: apps/v1 +metadata: + name: minio-tempo +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: minio-tempo + template: + metadata: + labels: + app.kubernetes.io/name: minio-tempo + spec: + volumes: + - name: storage + persistentVolumeClaim: + claimName: minio-tempo + containers: + - resources: {} + name: minio-tempo + command: + - /bin/sh + - '-c' + - | + mkdir -p /storage/tempo && \ + minio server /storage + env: + # TEST VALUES ONLY USED IN TEST DEV ENV + - name: MINIO_ROOT_USER + valueFrom: + secretKeyRef: + name: minio-tempo # Secret from minio-secret-tempo.yaml, the same one TempoStack reads + key: access_key_id # MinIO root user must equal the access key Tempo logs in with + - name: MINIO_ROOT_PASSWORD + valueFrom: + secretKeyRef: + name: minio-tempo # Secret from minio-secret-tempo.yaml, the same one TempoStack reads + key: access_key_secret # MinIO root password must equal the secret key Tempo logs in with + ports: + - containerPort: 9000 + protocol: TCP + imagePullPolicy: Always + volumeMounts: + - name: storage + mountPath: /storage + image:
quay.io/minio/minio + restartPolicy: Always + dnsPolicy: ClusterFirst + securityContext: {} + strategy: + type: Recreate + progressDeadlineSeconds: 600 + diff --git a/kubernetes/observability/tempo/tempo-multitenant.yaml b/kubernetes/observability/tempo/tempo-multitenant.yaml new file mode 100644 index 000000000..ebcf8663f --- /dev/null +++ b/kubernetes/observability/tempo/tempo-multitenant.yaml @@ -0,0 +1,27 @@ +# based on config/samples/openshift/tempo_v1alpha1_multitenancy.yaml +apiVersion: tempo.grafana.com/v1alpha1 +kind: TempoStack +metadata: + name: tempostack +spec: + storage: + secret: + name: minio-tempo # Secret from minio-secret-tempo.yaml (access keys, bucket, endpoint) + type: s3 + storageSize: 15Gi + resources: + total: + limits: + memory: 10Gi + cpu: 5000m + tenants: + mode: openshift + authentication: + - tenantName: dev # referenced as the `resources` entry in tempo-role.yaml + tenantId: "1610b0c3-c509-4592-a256-a1871353dbfa" + template: + gateway: + enabled: true + queryFrontend: + jaegerQuery: + enabled: true diff --git a/kubernetes/observability/tempo/tempo-role.yaml b/kubernetes/observability/tempo/tempo-role.yaml new file mode 100644 index 000000000..be1841d8e --- /dev/null +++ b/kubernetes/observability/tempo/tempo-role.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: tempostack-traces-reader +rules: + - apiGroups: + - 'tempo.grafana.com' + resources: + - dev # same value as tenantName in tempo-multitenant.yaml + resourceNames: + - traces + verbs: + - 'get' # read-only +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: tempostack-traces-reader +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: tempostack-traces-reader +subjects: + - kind: Group + apiGroup: rbac.authorization.k8s.io + name: system:authenticated # binds the reader role to the system:authenticated group + diff --git a/kubernetes/observability/tracing-ui-plugin.yaml b/kubernetes/observability/tracing-ui-plugin.yaml new file mode 100644 index 000000000..9791b61c3 --- /dev/null +++ b/kubernetes/observability/tracing-ui-plugin.yaml @@ -0,0 +1,8 @@ +apiVersion: observability.openshift.io/v1alpha1 +kind:
UIPlugin +metadata: + # The name must be exactly `distributed-tracing`; it will not work with any other name + name: distributed-tracing +spec: + type: DistributedTracing + diff --git a/kubernetes/observability/vllm-Containerfile b/kubernetes/observability/vllm-Containerfile new file mode 100644 index 000000000..3bce23fdd --- /dev/null +++ b/kubernetes/observability/vllm-Containerfile @@ -0,0 +1,9 @@ +# Use the vllm-openai image as the base +FROM docker.io/vllm/vllm-openai:v0.7.3 + +# Install OpenTelemetry packages; --no-cache-dir keeps pip's download cache out of the image layer +RUN pip install --no-cache-dir \ + "opentelemetry-sdk>=1.26.0,<1.27.0" \ + "opentelemetry-api>=1.26.0,<1.27.0" \ + "opentelemetry-exporter-otlp>=1.26.0,<1.27.0" \ + "opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0"