diff --git a/charts/kubeflow-trainer/README.md b/charts/kubeflow-trainer/README.md index 36d40c19af..792070c1aa 100644 --- a/charts/kubeflow-trainer/README.md +++ b/charts/kubeflow-trainer/README.md @@ -109,6 +109,19 @@ manager: traffic.sidecar.istio.io/excludeInboundPorts: "9443" ``` +### Grafana dashboard + +The chart can optionally install a default Grafana dashboard (as a ConfigMap) for controller health and TrainJob controller activity. +This is disabled by default and is intended for clusters that run Grafana with a dashboard sidecar that imports dashboards from labeled ConfigMaps. + +```bash +helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \ + --version 2.1.0 \ + --set grafanaDashboard.enabled=true +``` + +If your Grafana sidecar uses a different label selector, override `grafanaDashboard.labels` accordingly. + ## Values | Key | Type | Default | Description | @@ -141,6 +154,10 @@ manager: | manager.config.statusServer.qps | int | `5` | QPS rate limit for the TrainJob Status Server api client | | manager.config.statusServer.burst | int | `10` | Burst rate limit for the TrainJob Status Server api client | | webhook.failurePolicy | string | `"Fail"` | Specifies how unrecognized errors are handled. Available options are `Ignore` or `Fail`. | +| grafanaDashboard | object | `{"annotations":{},"enabled":false,"labels":{"grafana_dashboard":"1"}}` | | +| grafanaDashboard.enabled | bool | `false` | Whether to install the default Grafana dashboard ConfigMap. This is intended for environments that run Grafana with a dashboard sidecar (ConfigMap import). | +| grafanaDashboard.labels | object | `{"grafana_dashboard":"1"}` | Labels applied to the dashboard ConfigMap. Set this to match your Grafana sidecar label selector. | +| grafanaDashboard.annotations | object | `{}` | Optional annotations applied to the dashboard ConfigMap. Example: set a folder via grafana sidecar conventions. 
| | dataCache.enabled | bool | `false` | Enable/disable data cache support (LWS dependency, ClusterRole). Set to `true` to install data cache components. | | dataCache.lws.install | bool | `true` | Whether to install LeaderWorkerSet as a dependency. Set to `false` if LeaderWorkerSet is already installed in the cluster. | | dataCache.lws.fullnameOverride | string | `"lws"` | String to fully override LeaderWorkerSet release name. | diff --git a/charts/kubeflow-trainer/README.md.gotmpl b/charts/kubeflow-trainer/README.md.gotmpl index 46f45a7ef1..9862ad227b 100644 --- a/charts/kubeflow-trainer/README.md.gotmpl +++ b/charts/kubeflow-trainer/README.md.gotmpl @@ -127,6 +127,19 @@ manager: traffic.sidecar.istio.io/excludeInboundPorts: "9443" ``` +### Grafana dashboard + +The chart can optionally install a default Grafana dashboard (as a ConfigMap) for controller health and TrainJob controller activity. +This is disabled by default and is intended for clusters that run Grafana with a dashboard sidecar that imports dashboards from labeled ConfigMaps. + +```bash +helm install kubeflow-trainer oci://ghcr.io/kubeflow/charts/kubeflow-trainer \ + --version 2.1.0 \ + --set grafanaDashboard.enabled=true +``` + +If your Grafana sidecar uses a different label selector, override `grafanaDashboard.labels` accordingly. + {{ template "chart.valuesSection" . 
}} {{- define "chart.maintainersTable" -}} diff --git a/charts/kubeflow-trainer/dashboards/kubeflow-trainer-dashboard.json b/charts/kubeflow-trainer/dashboards/kubeflow-trainer-dashboard.json new file mode 100644 index 0000000000..67eecb483c --- /dev/null +++ b/charts/kubeflow-trainer/dashboards/kubeflow-trainer-dashboard.json @@ -0,0 +1,619 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Platform / Controller health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Whether Prometheus can scrape the controller-manager target.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "0": { + "text": "Down" + }, + "1": { + "text": "Up" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "background", + "graphMode": "none", + "justifyMode": "center", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "targets": [ + { + "editorMode": "code", + "expr": "max(up{namespace=~\"$namespace\", job=~\"$job\"})", + "instant": true, + "legendFormat": "", + "range": false, + "refId": "A" + } + ], + "title": "Controller scrape up", + 
"type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Go goroutines in the controller-manager process.", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "max(go_goroutines{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "goroutines", + "range": true, + "refId": "A" + } + ], + "title": "Goroutines", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Resident memory usage of the controller-manager process.", + "fieldConfig": { + "defaults": { + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "max(process_resident_memory_bytes{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "rss", + "range": true, + "refId": "A" + } + ], + "title": "Memory (RSS)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Total reconcile rate per controller and result.", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 5 + }, + "id": 5, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by(controller, result) 
(rate(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\", controller=~\"$controller\"}[5m]))", + "legendFormat": "{{controller}} / {{result}}", + "range": true, + "refId": "A" + } + ], + "title": "Reconciles / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "P95 reconcile duration by controller (seconds).", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 6, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le, controller) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=~\"$namespace\", job=~\"$job\", controller=~\"$controller\"}[5m])))", + "legendFormat": "{{controller}}", + "range": true, + "refId": "A" + } + ], + "title": "Reconcile duration p95", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 7, + "panels": [], + "title": "Platform / Queue & backlog", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "Workqueue depth by queue name (client-go workqueue).", + "fieldConfig": { + "defaults": { + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 8, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "max by(name) (workqueue_depth{namespace=~\"$namespace\", job=~\"$job\"})", + "legendFormat": "{{name}}", + "range": true, + "refId": "A" + } + ], + "title": "Workqueue depth", + "type": "timeseries" + }, + { + "datasource": { + 
"type": "prometheus", + "uid": "${datasource}" + }, + "description": "Rate at which items are re-queued (retries) per workqueue.", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 9, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by(name) (rate(workqueue_retries_total{namespace=~\"$namespace\", job=~\"$job\"}[5m]))", + "legendFormat": "{{name}}", + "range": true, + "refId": "A" + } + ], + "title": "Workqueue retries / sec", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 10, + "panels": [], + "title": "ML / TrainJob lifecycle (available today)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "TrainJob controller reconcile rate by result. 
Until dedicated TrainJob lifecycle metrics exist, this is the most reliable out-of-the-box signal of activity.", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 11, + "options": { + "legend": { + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum by(result) (rate(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\", controller=\"trainjob_controller\"}[5m]))", + "legendFormat": "{{result}}", + "range": true, + "refId": "A" + } + ], + "title": "TrainJob reconciles / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "TrainJob controller reconcile error rate.", + "fieldConfig": { + "defaults": { + "unit": "ops" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 12, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "editorMode": "code", + "expr": "sum(rate(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\", controller=\"trainjob_controller\", result=\"error\"}[5m]))", + "legendFormat": "errors", + "range": true, + "refId": "A" + } + ], + "title": "TrainJob reconcile errors / sec", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "P95 TrainJob reconcile duration (seconds).", + "fieldConfig": { + "defaults": { + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 13, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + 
"editorMode": "code", + "expr": "histogram_quantile(0.95, sum by(le) (rate(controller_runtime_reconcile_time_seconds_bucket{namespace=~\"$namespace\", job=~\"$job\", controller=\"trainjob_controller\"}[5m])))", + "legendFormat": "p95", + "range": true, + "refId": "A" + } + ], + "title": "TrainJob reconcile duration p95", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ + "kubeflow", + "trainer", + "observability", + "controller-runtime" + ], + "templating": { + "list": [ + { + "hide": 0, + "includeAll": false, + "label": "Prometheus", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(up, namespace)", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(up, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(up{namespace=~\"$namespace\"}, job)", + "hide": 0, + "includeAll": true, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": { + "query": "label_values(up{namespace=~\"$namespace\"}, job)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": ".*trainer.*", + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, controller)", + "hide": 0, + "includeAll":
true, + "label": "Controller", + "multi": true, + "name": "controller", + "options": [], + "query": { + "query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\", job=~\"$job\"}, controller)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "Kubeflow Trainer - Controller health & TrainJob lifecycle", + "uid": "kubeflow-trainer-controller", + "version": 1, + "weekStart": "" +} diff --git a/charts/kubeflow-trainer/templates/grafana-dashboard-configmap.yaml b/charts/kubeflow-trainer/templates/grafana-dashboard-configmap.yaml new file mode 100644 index 0000000000..1462e87a06 --- /dev/null +++ b/charts/kubeflow-trainer/templates/grafana-dashboard-configmap.yaml @@ -0,0 +1,20 @@ +{{- if .Values.grafanaDashboard.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "trainer.fullname" . }}-grafana-dashboard + namespace: {{ .Release.Namespace }} + labels: + {{- include "trainer.labels" . | nindent 4 }} + app.kubernetes.io/component: grafana-dashboard + {{- with .Values.grafanaDashboard.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + {{- with .Values.grafanaDashboard.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +data: + kubeflow-trainer-dashboard.json: |- +{{ .Files.Get "dashboards/kubeflow-trainer-dashboard.json" | indent 4 }} +{{- end }} diff --git a/charts/kubeflow-trainer/values.yaml b/charts/kubeflow-trainer/values.yaml index 6d32e47214..b44cc380ed 100644 --- a/charts/kubeflow-trainer/values.yaml +++ b/charts/kubeflow-trainer/values.yaml @@ -152,6 +152,20 @@ webhook: # Available options are `Ignore` or `Fail`. failurePolicy: Fail +grafanaDashboard: + # -- Whether to install the default Grafana dashboard ConfigMap. + # This is intended for environments that run Grafana with a dashboard sidecar (ConfigMap import). 
+ enabled: false + + # -- Labels applied to the dashboard ConfigMap. + # Set this to match your Grafana sidecar label selector. + labels: + grafana_dashboard: "1" + + # -- Optional annotations applied to the dashboard ConfigMap. + # Example: set a folder via grafana sidecar conventions. + annotations: {} + dataCache: # -- Enable/disable data cache support (LWS dependency, ClusterRole). # Set to `true` to install data cache components.