Description
Describe the bug: a clear and concise description of what the bug is.
I am trying to get the cluster dashboard to work; however, the metric
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
seems to be unavailable.
I can see that container_cpu_usage_seconds_total is available, so I guess the part before the colon must come from some kind of renaming happening somewhere, but I can't find the sum_irate part anywhere.
I guess it has to do with the fact that I am unable to use the kubelet ServiceMonitor, as I have insufficient rights to create objects in the kube-system namespace, so I created a custom scrape config to scrape kubelet and cAdvisor.
What's your helm version?
v3.17.2
What's your kubectl version?
1.30.9
Which chart?
kube-prometheus-stack
What's the chart version?
70.3.0
What happened?
No response
What you expected to happen?
A documentation of the desired metric or where the renamings happen or a link to an external reference describing this behaviour.
How to reproduce it?
I used the following custom scrape config for kubelet and cAdvisor (see additionalScrapeConfigs in the values below):
Enter the changed values of values.yaml?
# Turn off the chart's CoreDNS monitoring component.
coreDns:
  enabled: false
# Turn off the chart's kube-dns monitoring component.
kubeDns:
  enabled: false
# API server monitoring: enabled, with targets selected by the
# `app: kube-apiserver` label and the job name taken from the `name` label.
kubeApiServer:
  enabled: true
  tlsConfig:
    # Left empty in the original report; written as an explicit null.
    serverName: null
  serviceMonitor:
    enabled: true
    jobLabel: name
    selector:
      matchLabels:
        app: kube-apiserver
# Controller-manager monitoring: no chart-managed Service; the ServiceMonitor
# selects an existing Service labelled `app: kube-controller-manager` and
# scrapes its `metrics` port.
kubeControllerManager:
  service:
    enabled: false
  serviceMonitor:
    port: metrics
    jobLabel: name
    selector:
      matchLabels:
        app: kube-controller-manager
# etcd monitoring: no chart-managed Service; the ServiceMonitor selects an
# existing Service labelled `app: etcd` and scrapes its `metrics` port.
kubeEtcd:
  service:
    enabled: false
  serviceMonitor:
    port: metrics
    jobLabel: name
    selector:
      matchLabels:
        app: etcd
# kube-proxy monitoring is switched off, including its Service.
kubeProxy:
  enabled: false
  service:
    enabled: false
# NOTE(review): this key is spelled `kubeApiserver` (lowercase "s"), while the
# same settings also appear under `kubeApiServer` in this values file.
# Presumably the chart reads only one of the two spellings — confirm and drop
# the unused one.
kubeApiserver:
  enabled: true
  serviceMonitor:
    enabled: true
    jobLabel: name
    selector:
      matchLabels:
        app: kube-apiserver
# The chart's kubelet component and its ServiceMonitor are both disabled;
# kubelet/cAdvisor are scraped through additionalScrapeConfigs instead.
kubelet:
  enabled: false
  serviceMonitor:
    enabled: false
# Scheduler monitoring: no chart-managed Service; the ServiceMonitor selects an
# existing Service labelled `app: kube-scheduler` and scrapes its `metrics`
# port.
kubeScheduler:
  service:
    enabled: false
  serviceMonitor:
    port: metrics
    jobLabel: name
    selector:
      matchLabels:
        app: kube-scheduler
# The bundled node-exporter is disabled; a custom scrape job named
# `node-exporter` is defined under additionalScrapeConfigs.
nodeExporter:
  enabled: false
# Prometheus Operator overrides: kubelet Service management off, debug log
# level, and image registries pointed at the `registry.proxy` mirror.
# NOTE(review): indentation was lost in this paste, so the nesting of the keys
# below cannot be determined from the text alone. Restore it from the deployed
# values file before reusing this section.
prometheusOperator:
kubeletService:
enabled: false
# NOTE(review): `deployment` appears three times in this section; at a single
# nesting level the later entries would override the earlier ones — confirm
# which parent each one belongs to.
deployment:
logLevel: debug
image:
registry: "registry.proxy/publicquay"
admissionWebhooks:
deployment:
image:
registry: "registry.proxy/publicquay"
patch:
image:
registry: "registry.proxy/k8scache"
deployment:
image:
registry: "registry.proxy/publicquay"
# NOTE(review): spelled `admissionWebHook` here but `admissionWebhooks` above;
# presumably only one spelling is read by the chart — verify.
admissionWebHook:
patch:
image:
registry: "registry.proxy/publicquay"
prometheusConfigReloader:
image:
registry: "registry.proxy/publicquay"
# NOTE(review): a `prometheusSpec` key also appears under the top-level
# `prometheus` key in this file — confirm whether this copy is intended here.
prometheusSpec:
podMonitorSelector:
matchLabels:
podMonitor: mystructure
image:
registry: "registry.proxy/publicquay"
thanosRulerSpec:
image:
registry: "registry.proxy/publicquay"
thanosImage:
registry: "registry.proxy/publicquay"
# Pull the kube-state-metrics image through the internal registry mirror.
kube-state-metrics:
  image:
    registry: "registry.proxy/k8scache"
# Prometheus server overrides: image registry, retention, persistent storage,
# monitor selectors, hand-written scrape jobs (kubelet, cAdvisor,
# node-exporter) and extra ClusterRole rules those jobs rely on.
# NOTE(review): indentation was lost in the original paste. The nesting below
# is reconstructed on the assumption that it follows the kube-prometheus-stack
# values layout — confirm against the deployed values file.
# NOTE(review): `__address__`, `__metrics_path__`, `__name__`, the leading
# `__` on `__meta_kubernetes_node_name`, `.*`, `\.` and the `_` before some
# regex groups appear to have been stripped by markdown rendering and are
# restored here — confirm.
prometheus:
  prometheusSpec:
    image:
      registry: "registry.proxy/publicquay"
    retention: 30d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: custom-storage-class
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
    serviceMonitorSelector:
      # discover all service monitors. label selectors are ANDed which means we
      # either have to use this helm charts values
      # or we manipulate this helm charts labels.
      # Explicit empty map: a bare `matchLabels:` parses as null.
      matchLabels: {}
    podMonitorSelector:
      # discover all pod monitors. label selectors are ANDed which means we
      # either have to use this helm charts values
      # or we manipulate this helm charts labels.
      # Explicit empty map: a bare `matchLabels:` parses as null.
      matchLabels: {}
    additionalScrapeConfigs:
      # Kubelet metrics of every node, fetched through the API server proxy.
      - job_name: kubernetes-nodes
        honor_timestamps: true
        track_timestamps_staleness: false
        scrape_interval: 1m
        scrape_timeout: 20s
        scrape_protocols:
          - OpenMetricsText1.0.0
          - OpenMetricsText0.0.1
          - PrometheusText0.0.4
        metrics_path: /metrics
        scheme: https
        enable_compression: true
        authorization:
          type: Bearer
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
          # Certificate verification is skipped even though ca_file is set.
          insecure_skip_verify: true
        follow_redirects: true
        enable_http2: true
        http_headers: null
        relabel_configs:
          # Expose the scrape path as a regular `metrics_path` label.
          - action: replace
            source_labels: [__metrics_path__]
            target_label: metrics_path
          # Send every scrape to the in-cluster API server.
          - separator: ';'
            regex: '(.*)'
            target_label: __address__
            replacement: kubernetes.default.svc:443
          # Rewrite the path to the node's proxied kubelet endpoint.
          - source_labels: [__meta_kubernetes_node_name]
            separator: ';'
            regex: '(.+)'
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics
            action: replace
        metric_relabel_configs:
          # Drop selected histogram buckets of the storage operation metrics.
          - action: drop
            regex: '(csi_operations|storage_operation_duration)_seconds_bucket;(0.25|2.5|15|25|120|600)(\.0)?'
            source_labels: [__name__, le]
        kubernetes_sd_configs:
          - role: node
            kubeconfig_file: ""
            follow_redirects: true
            enable_http2: true
            http_headers: null
      # cAdvisor metrics of every node, fetched through the API server proxy.
      # NOTE(review): presumably the chart's default recording rule behind
      # node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
      # selects job="kubelet" and metrics_path="/metrics/cadvisor". This job is
      # named "kubernetes-cadvisor" and sets no `metrics_path` label, so such a
      # rule would return nothing — confirm against the chart's k8s.rules.
      - job_name: "kubernetes-cadvisor"
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        authorization:
          type: Bearer
          credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        scheme: https
        kubernetes_sd_configs:
          - role: node
        relabel_configs:
          # Send every scrape to the in-cluster API server.
          - separator: ';'
            regex: '(.*)'
            target_label: __address__
            replacement: kubernetes.default.svc:443
            action: replace
          # Rewrite the path to the node's proxied cAdvisor endpoint.
          - source_labels: [__meta_kubernetes_node_name]
            regex: '(.+)'
            target_label: __metrics_path__
            replacement: /api/v1/nodes/$1/proxy/metrics/cadvisor
            action: replace
        metric_relabel_configs:
          # Selected container CPU series.
          - action: drop
            regex: 'container_cpu_(cfs_throttled_seconds_total|load_average_10s|system_seconds_total|user_seconds_total)'
            source_labels: [__name__]
          # Selected container filesystem series.
          - action: drop
            regex: 'container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)'
            source_labels: [__name__]
          # Selected container memory series.
          - action: drop
            regex: 'container_memory_(mapped_file|swap)'
            source_labels: [__name__]
          # Selected container process series.
          - action: drop
            regex: 'container_(file_descriptors|tasks_state|threads_max)'
            source_labels: [__name__]
          # container_memory_failures_total with scope="hierarchy".
          - action: drop
            regex: 'container_memory_failures_total;hierarchy'
            source_labels: [__name__, scope]
          # Network series for interfaces whose name starts with one of the
          # listed prefixes.
          - action: drop
            regex: 'container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*'
            source_labels: [__name__, interface]
          # All container_spec* series.
          - action: drop
            regex: 'container_spec.*'
            source_labels: [__name__]
          # Series that have an `id` but an empty `pod` label.
          - action: drop
            regex: '.+;'
            source_labels: [id, pod]
      # Pods in kube-system that run under the custom-node-exporter
      # service account.
      - job_name: node-exporter
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        scheme: http
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              own_namespace: false
              names:
                - kube-system
            selectors:
              - role: pod
                field: "spec.serviceAccountName=custom-node-exporter"
  # Extra read-only ClusterRole rules, including node proxy access and the
  # /metrics and /metrics/cadvisor non-resource URLs.
  additionalRulesForClusterRole:
    - apiGroups:
        - ""
      resources:
        - nodes
        - nodes/proxy
        - nodes/metrics
        - services
        - endpoints
        - pods
      verbs:
        - get
        - list
        - watch
    - apiGroups:
        - discovery.k8s.io
      resources:
        - endpointslices
      verbs:
        - get
        - list
        - watch
    - apiGroups:
        - networking.k8s.io
      resources:
        - ingresses
      verbs:
        - get
        - list
        - watch
    - nonResourceURLs:
        - /metrics
        - /metrics/cadvisor
      verbs:
        - get
# Pull the Alertmanager image through the internal registry mirror.
alertmanager:
  alertmanagerSpec:
    image:
      registry: "registry.proxy/publicquay"
# Grafana overrides: admin credentials from an existing Secret, serving under
# the /grafana sub-path, extra Traefik objects for routing, and image
# registries.
# NOTE(review): indentation was lost in the original paste. `extraObjects`,
# `image` and `sidecar` are placed under `grafana` because the image/sidecar
# keys follow the extra objects — confirm against the deployed values file.
grafana:
  admin:
    existingSecret: "grafana-adminuser-secret"
    userKey: admin-user
    passwordKey: admin-password
  grafana.ini:
    server:
      root_url: https://example.com/grafana/
  extraObjects:
    # Secret referenced by admin.existingSecret; the ${...} placeholders are
    # filled in outside of this file.
    - apiVersion: v1
      kind: Secret
      metadata:
        name: grafana-adminuser-secret
      type: Opaque
      data:
        admin-user: "${grafana_admin_username}"
        admin-password: "${grafana_admin_password}"
    # Strips the /grafana prefix before the request is forwarded.
    - apiVersion: traefik.containo.us/v1alpha1
      kind: Middleware
      metadata:
        labels:
          expose_via: frontend
        name: grafana-strip-path
      spec:
        stripPrefix:
          prefixes:
            - /grafana
    # Routes https://example.com/grafana to the Grafana Service.
    - apiVersion: traefik.containo.us/v1alpha1
      metadata:
        name: 'grafana-ingressroute-{{ .Release.Name }}'
        labels:
          expose_via: frontend
      kind: IngressRoute
      spec:
        entryPoints:
          - websecure
        routes:
          - kind: Rule
            # The backticks around the host and path were lost in the paste
            # (the rule was split across three lines); restored here.
            match: 'Host(`example.com`) && PathPrefix(`/grafana`)'
            middlewares:
              - name: grafana-strip-path
            services:
              - kind: Service
                name: kube-prometheus-stack-grafana
                port: 80
        tls:
          domains:
            - main: example.com
  image:
    registry: "registry.proxy/dockerhub"
  sidecar:
    image:
      registry: "registry.proxy/publicquay"
# Image registries for the CRD upgrade job.
# NOTE(review): the key is spelled `busyBox` here; verify the casing the chart
# expects, since a mismatched key would presumably be ignored.
crds:
  upgradeJob:
    image:
      busyBox:
        registry: "registry.proxy/dockerhub"
      kubectl:
        registry: "registry.proxy/k8scache"
Enter the command that you execute and failing/misfunctioning.
no specific command
Anything else we need to know?
No response