added extra pipeline for metrics monitoring (#416)

obs-gh-virjramakrishnan · web-flow · commit 24e95f11c642 · 2025-11-18T15:03:52.000-08:00
* add extra pipeline for kubeletstats monitoring in fargate pods

* feat: OB-41415 send sidecar metrics to observe directly and restructure config for extensibility

Customers have been requesting that we support EKS fargate hosted clusters.
To do this, I add a new fargate mode (off by default) that will install an
otel operator, which will use a sidecar container to query metrics from the
pod it is attached to.
diff --git a/charts/agent/Chart.yaml b/charts/agent/Chart.yaml
@@ -2,7 +2,7 @@ apiVersion: v2
 name: agent
 description: Chart to install K8s collection stack based on Observe Agent
 type: application
-version: 0.74.4
+version: 0.75.0
 appVersion: "2.10.1"
 dependencies:
   - name: opentelemetry-collector
diff --git a/charts/agent/README.md b/charts/agent/README.md
@@ -1,6 +1,6 @@
 # agent
 
-![Version: 0.74.4](https://img.shields.io/badge/Version-0.74.4-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.10.1](https://img.shields.io/badge/AppVersion-2.10.1-informational?style=flat-square)
+![Version: 0.75.0](https://img.shields.io/badge/Version-0.75.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.10.1](https://img.shields.io/badge/AppVersion-2.10.1-informational?style=flat-square)
 
 Chart to install K8s collection stack based on Observe Agent
 
@@ -30,6 +30,10 @@ This service is a *daemonset* which means it runs on every node in the cluster.
 
 This service is a *single-instance deployment*. It's critical that this service is only a single instance since otherwise it would produce duplicate data. It is responsible for monitoring the other containers of Observe Agent running by scraping the exposed Prometheus metrics of those agents. It's best practice to separate the monitoring of the agents from the agents themselves since if problems develop in those pipelines, we would need the agent telemetry to keep flowing in order to diagnose.
 
+## fargate-collector
+
+This service is an *OpenTelemetryCollector*, a custom resource that is managed by an OpenTelemetry Operator (which must be installed separately). It is responsible for collecting metrics from nodes when running on AWS Fargate. It injects a sidecar into every pod with the appropriate annotation, and scrapes the API of the kubelet of that node for metrics. Daemonsets are not allowed on Fargate, so this service is intended as a replacement for the usual approach for node metric collection with the `node-logs-metrics` daemonset.
+
 ## Maintainers
 
 | Name | Email | Url |
@@ -599,6 +603,10 @@ This service is a *single-instance deployment*. It's critical that this service
 | node.metrics.fileSystem.excludeMountPoints | string | `"[\"/dev/*\",\"/proc/*\",\"/sys/*\",\"/run/k3s/containerd/*\",\"/var/lib/docker/*\",\"/var/lib/kubelet/*\",\"/snap/*\"]"` |  |
 | node.metrics.fileSystem.rootPath | string | `"/hostfs"` |  |
 | node.metrics.interval | string | `"60s"` |  |
+| nodeless.enabled | bool | `false` | Enables nodeless mode. Nodeless mode is intended for environments where daemonsets are not supported. |
+| nodeless.hostingPlatform | string | `""` | The hosting platform for the nodeless mode. Valid values are "fargate". |
+| nodeless.metrics.enabled | bool | `false` |  |
+| nodeless.serviceAccounts | object | `{}` | A map of namespaces to lists of service accounts. If you provide service accounts here we will attach a cluster role and binding granting the service accounts permission to the relevant Kubernetes APIs needed to collect metrics. If empty, you will need to manually grant the service accounts the necessary permissions. Example:   serviceAccounts:     default: ["app1-sa", "app2-sa"]     fargate-ns: ["fargate-app-sa"] |
 | observe.collectionEndpoint.value | string | `""` |  |
 | observe.entityToken.create | bool | `false` |  |
 | observe.entityToken.use | bool | `false` |  |
diff --git a/charts/agent/README.md.gotmpl b/charts/agent/README.md.gotmpl
@@ -31,6 +31,10 @@ This service is a *daemonset* which means it runs on every node in the cluster.
 
 This service is a *single-instance deployment*. It's critical that this service is only a single instance since otherwise it would produce duplicate data. It is responsible for monitoring the other containers of Observe Agent running by scraping the exposed Prometheus metrics of those agents. It's best practice to separate the monitoring of the agents from the agents themselves since if problems develop in those pipelines, we would need the agent telemetry to keep flowing in order to diagnose.
 
+## fargate-collector
+
+This service is an *OpenTelemetryCollector*, a custom resource that is managed by an OpenTelemetry Operator (which must be installed separately). It is responsible for collecting metrics from nodes when running on AWS Fargate. It injects a sidecar into every pod with the appropriate annotation, and scrapes the API of the kubelet of that node for metrics. Daemonsets are not allowed on Fargate, so this service is intended as a replacement for the usual approach for node metric collection with the `node-logs-metrics` daemonset.
+
 {{ template "chart.homepageLine" . }}
 
 {{ template "chart.maintainersSection" . }}
diff --git a/charts/agent/templates/_config-processors.tpl b/charts/agent/templates/_config-processors.tpl
@@ -148,6 +148,14 @@ attributes/debug_source_cadvisor_metrics:
 {{- end -}}
 {{- end -}}
 
+{{- define "config.processors.attributes.sidecar_kubeletstats_metrics" -}}
+attributes/debug_source_sidecar_kubeletstats_metrics:  # tags telemetry with the pipeline that produced it
+  actions:
+    - key: debug_source
+      action: insert  # NOTE(review): presumably leaves an already-present debug_source untouched - confirm against the attributes processor docs
+      value: sidecar_kubeletstats_metrics
+{{- end -}}
+
 {{- define "config.processors.attributes.drop_container_info" -}}
 resource/drop_container_info:
   attributes:
@@ -162,6 +170,21 @@ resource/drop_service_name:
       key: service.name
 {{- end -}}
 
+{{- define "config.processors.metricstransform.duplicate_k8s_cpu_metrics" -}}
+# Emit each new-style `*.cpu.usage` metric under the `*.cpu.utilization` name that our Kubernetes Explorer relies on.
+metricstransform/duplicate_k8s_cpu_metrics:
+  transforms:
+    - include: container.cpu.usage  # container-level CPU
+      action: insert  # NOTE(review): expected to add a renamed copy and keep the original metric - confirm against the metricstransform docs
+      new_name: container.cpu.utilization
+    - include: k8s.pod.cpu.usage  # pod-level CPU
+      action: insert
+      new_name: k8s.pod.cpu.utilization
+    - include: k8s.node.cpu.usage  # node-level CPU
+      action: insert
+      new_name: k8s.node.cpu.utilization
+{{- end -}}
+
 {{- define "config.processors.filter.drop_long_spans" -}}
 {{- if eq .Values.node.forwarder.traces.maxSpanDuration "none" }}
 {{- else if (regexMatch "^[0-9]+(ns|us|ms|s|m|h)$" .Values.node.forwarder.traces.maxSpanDuration) }}
diff --git a/charts/agent/templates/_config.tpl b/charts/agent/templates/_config.tpl
@@ -12,6 +12,14 @@
 {{- toYaml $config | indent 2 }}
 {{- end }}
 
+{{- define "observe.sidecar.applyFargateSidecarConfig" -}}{{- /* Renders the Fargate sidecar collector config with user overrides merged in, indented by 2 spaces. */ -}}
+{{- $values := deepCopy .Values }}{{- /* copy of .Values; presumably so the merges below cannot mutate the chart's own values */ -}}
+{{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }}{{- /* copy of the root context whose Values entry is the copied values */ -}}
+{{- $config := mustMergeOverwrite ( include "observe.sidecar.FargateSidecar.config" $data |  fromYaml ) ($values.agent.config.FargateSidecar) ($values.agent.config.global.overrides) -}}{{- /* base config, then agent.config.FargateSidecar, then agent.config.global.overrides; later arguments overwrite earlier ones */ -}}
+{{- toYaml $config | indent 2 }}
+{{- end }}
+
+
 {{- define "observe.deployment.applyPrometheusScraperConfig" -}}
 {{- $values := deepCopy .Values }}
 {{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }}
diff --git a/charts/agent/templates/_fargate-sidecar-config.tpl b/charts/agent/templates/_fargate-sidecar-config.tpl
@@ -0,0 +1,40 @@
+{{- define "observe.sidecar.FargateSidecar.config" -}}{{- /* Collector config for the Fargate sidecar: one kubeletstats metrics pipeline exported via Prometheus remote write. */ -}}
+
+receivers:  # the kubeletstats endpoint below goes through the API server's node proxy path
+{{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" "https://kubernetes.default.svc/api/v1/nodes/${env:K8S_NODE_NAME}/proxy") | nindent 2 }}
+
+processors:
+
+{{- include "config.processors.memory_limiter" . | nindent 2 }}
+{{- include "config.processors.batch" . | nindent 2 }}
+{{- include "config.processors.resource_detection.cloud" . | nindent 2 }}
+{{- include "config.processors.attributes.k8sattributes" . | nindent 2 }}
+{{- include "config.processors.resource.observe_common" . | nindent 2 }}
+{{- include "config.processors.deltatocumulative" . | nindent 2 }}
+{{- include "config.processors.attributes.add_empty_service_attributes" . | nindent 2 }}
+{{- include "config.processors.metricstransform.duplicate_k8s_cpu_metrics" . | nindent 2 }}
+{{- include "config.processors.attributes.sidecar_kubeletstats_metrics" . | nindent 2 }}
+
+exporters:
+{{- include "config.exporters.debug" . | nindent 2 }}
+{{- include "config.exporters.prometheusremotewrite" . | nindent 2 }}
+
+{{ $kubeletstatsExporters := (list "prometheusremotewrite/observe") -}}{{- /* exporters wired into the metrics/kubeletstats pipeline */ -}}
+
+{{- if eq .Values.agent.config.global.debug.enabled true }}{{- /* also export to debug/override when global debug is enabled */}}
+  {{- $kubeletstatsExporters = concat $kubeletstatsExporters ( list "debug/override" ) | uniq }}
+{{- end }}
+
+# Only the kubeletstats metrics pipeline exists today, so rendering fails when it is disabled.
+# If other pipelines are added later, fail only when no telemetry collection is enabled at all.
+service:
+  pipelines:
+    {{- if .Values.nodeless.metrics.enabled }}
+      metrics/kubeletstats:
+        receivers: [kubeletstats]
+        processors: [memory_limiter, metricstransform/duplicate_k8s_cpu_metrics, k8sattributes, deltatocumulative/observe, batch, resourcedetection/cloud, resource/observe_common, attributes/debug_source_sidecar_kubeletstats_metrics]
+        exporters: [{{ join ", " $kubeletstatsExporters }}]
+    {{- else }}
+      {{- fail "nodeless.metrics.enabled must be true for Fargate sidecar - otherwise no telemetry will be collected" }}
+    {{- end }}
+{{- end }}
diff --git a/charts/agent/templates/_kubeletstats-settings.tpl b/charts/agent/templates/_kubeletstats-settings.tpl
@@ -0,0 +1,55 @@
+{{- define "observe.kubeletstats.receiver" -}}
+kubeletstats:  # shared receiver definition; the caller passes a dict with "Values" (chart values) and "endpoint"
+  collection_interval: {{.Values.node.containers.metrics.interval}}  # the node.containers.metrics interval is used by every caller of this helper
+  auth_type: 'serviceAccount'
+  endpoint: {{ .endpoint }}  # rendered verbatim, so the caller must supply any YAML quoting the value needs
+  node: '${env:K8S_NODE_NAME}'  # K8S_NODE_NAME must be set in the collector container's environment
+  insecure_skip_verify: true  # NOTE(review): TLS verification of the endpoint is skipped - confirm this is intended for every caller
+  k8s_api_config:
+      auth_type: serviceAccount
+  metric_groups:
+    - node
+    - pod
+    - container
+  metrics:
+    # The following metrics are optional and must be enabled manually as per:
+    # https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
+    container.cpu.usage:
+      enabled: true
+    container.uptime:
+      enabled: true
+    k8s.container.cpu.node.utilization:
+      enabled: true
+    k8s.container.cpu_limit_utilization:
+      enabled: true
+    k8s.container.cpu_request_utilization:
+      enabled: true
+    k8s.container.memory.node.utilization:
+      enabled: true
+    k8s.container.memory_limit_utilization:
+      enabled: true
+    k8s.container.memory_request_utilization:
+      enabled: true
+    k8s.node.cpu.usage:
+      enabled: true
+    k8s.node.uptime:
+      enabled: true
+    k8s.pod.cpu.node.utilization:
+      enabled: true
+    k8s.pod.cpu.usage:
+      enabled: true
+    k8s.pod.cpu_limit_utilization:
+      enabled: true
+    k8s.pod.cpu_request_utilization:
+      enabled: true
+    k8s.pod.memory.node.utilization:
+      enabled: true
+    k8s.pod.memory_limit_utilization:
+      enabled: true
+    k8s.pod.memory_request_utilization:
+      enabled: true
+    k8s.pod.uptime:
+      enabled: true
+  extra_metadata_labels:  # additional metadata requested from the receiver
+    - container.id
+{{- end }}
diff --git a/charts/agent/templates/_node-logs-metrics-config.tpl b/charts/agent/templates/_node-logs-metrics-config.tpl
@@ -70,59 +70,13 @@ receivers:
       network: null
   {{ end -}}
   {{- if .Values.node.containers.metrics.enabled }}
-  kubeletstats:
-    collection_interval: {{.Values.node.containers.metrics.interval}}
-    auth_type: 'serviceAccount'
-    endpoint: {{ if .Values.node.kubeletstats.useNodeIp }}"${env:K8S_NODE_IP}:10250"{{ else }}"${env:K8S_NODE_NAME}:10250"{{ end }}
-    node: '${env:K8S_NODE_NAME}'
-    insecure_skip_verify: true
-    k8s_api_config:
-        auth_type: serviceAccount
-    metric_groups:
-      - node
-      - pod
-      - container
-    metrics:
-      # The following metrics are optional and must be enabled manually as per:
-      # https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
-      container.cpu.usage:
-        enabled: true
-      container.uptime:
-        enabled: true
-      k8s.container.cpu.node.utilization:
-        enabled: true
-      k8s.container.cpu_limit_utilization:
-        enabled: true
-      k8s.container.cpu_request_utilization:
-        enabled: true
-      k8s.container.memory.node.utilization:
-        enabled: true
-      k8s.container.memory_limit_utilization:
-        enabled: true
-      k8s.container.memory_request_utilization:
-        enabled: true
-      k8s.node.cpu.usage:
-        enabled: true
-      k8s.node.uptime:
-        enabled: true
-      k8s.pod.cpu.node.utilization:
-        enabled: true
-      k8s.pod.cpu.usage:
-        enabled: true
-      k8s.pod.cpu_limit_utilization:
-        enabled: true
-      k8s.pod.cpu_request_utilization:
-        enabled: true
-      k8s.pod.memory.node.utilization:
-        enabled: true
-      k8s.pod.memory_limit_utilization:
-        enabled: true
-      k8s.pod.memory_request_utilization:
-        enabled: true
-      k8s.pod.uptime:
-        enabled: true
-    extra_metadata_labels:
-      - container.id
+  {{- $endpoint := "" }}
+  {{- if .Values.node.kubeletstats.useNodeIp }}
+    {{- $endpoint = "\"${env:K8S_NODE_IP}:10250\"" }}
+  {{- else }}
+    {{- $endpoint = "\"${env:K8S_NODE_NAME}:10250\"" }}
+  {{- end }}
+  {{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" $endpoint) | nindent 2 }}
   {{ end -}}
   {{- if .Values.node.containers.logs.enabled }}
   filelog:
@@ -165,6 +119,7 @@ processors:
 {{- include "config.processors.batch" . | nindent 2 }}
 {{- include "config.processors.attributes.k8sattributes" . | nindent 2 }}
 {{- include "config.processors.resource.observe_common" . | nindent 2 }}
+{{- include "config.processors.metricstransform.duplicate_k8s_cpu_metrics" . | nindent 2 }}
 
 {{- if .Values.agent.config.global.fleet.enabled }}
 {{- include "config.processors.resource_detection" . | nindent 2 }}
@@ -189,18 +144,6 @@ processors:
         action: insert
         value: kubeletstats_metrics
 
-  # convert new k8s metric names to the names our Kubernetes Explorer relies on
-  metricstransform/duplicate_k8s_cpu_metrics:
-    transforms:
-      - include: container.cpu.usage
-        action: insert
-        new_name: container.cpu.utilization
-      - include: k8s.pod.cpu.usage
-        action: insert
-        new_name: k8s.pod.cpu.utilization
-      - include: k8s.node.cpu.usage
-        action: insert
-        new_name: k8s.node.cpu.utilization
 
 # Create intermediate lists for pipeline arrays to then modify based on values.yaml
 {{- $logsExporters := (list "otlphttp/observe/base") -}}
diff --git a/charts/agent/templates/nodeless-cluster-role.yaml b/charts/agent/templates/nodeless-cluster-role.yaml
@@ -0,0 +1,56 @@
+{{- if .Values.nodeless.enabled }}
+{{- if .Values.nodeless.serviceAccounts }}{{- /* RBAC is rendered only when service accounts are listed; otherwise permissions must be granted manually */}}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole  # read-only access bound to every service account listed in nodeless.serviceAccounts
+metadata:
+  name: observe-agent-nodeless-cluster-role-{{ template "observe-agent.namespace" . }}
+  labels:
+    app.kubernetes.io/name: observe-agent-nodeless-cluster-role
+    app.kubernetes.io/instance: observe-agent
+rules:
+  - apiGroups: [""]
+    resources:
+      - nodes
+      - nodes/proxy  # presumably required because the sidecar queries the kubelet through the API server node proxy
+      - namespaces
+      - pods
+      - configmaps
+    verbs: ["get", "list", "watch"]
+
+  - apiGroups: ["apps"]
+    resources:
+      - replicasets
+    verbs: ["get", "list", "watch"]
+---
+{{- range $namespace, $serviceAccounts := .Values.nodeless.serviceAccounts }}{{- /* one ClusterRoleBinding per (namespace, service account) pair */}}
+{{- if not (kindIs "slice" $serviceAccounts) }}
+{{- fail (printf "nodeless.serviceAccounts[%s] must be a list, but got: %v (type: %s)" $namespace $serviceAccounts (kindOf $serviceAccounts)) }}
+{{- end }}
+{{- if not $serviceAccounts }}
+{{- fail (printf "nodeless.serviceAccounts[%s] is empty. Please provide at least one service account or remove the namespace from the map." $namespace) }}
+{{- end }}
+{{- range $serviceAccounts }}
+{{- if not (kindIs "string" .) }}
+{{- fail (printf "nodeless.serviceAccounts[%s] contains a non-string value: %v. All service account names must be strings." $namespace .) }}
+{{- end }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: observe-agent-nodeless-cluster-role-binding-{{ $namespace }}-{{ . }}
+  labels:
+    app.kubernetes.io/name: observe-agent-nodeless-cluster-role-binding
+    app.kubernetes.io/instance: observe-agent
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: observe-agent-nodeless-cluster-role-{{ template "observe-agent.namespace" $ }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ . | quote }}  # quoted so names such as "123" or "true" stay YAML strings
+    namespace: {{ $namespace | quote }}  # quoted for the same reason; all-digit namespace names are valid
+---
+{{- end }}
+{{- end }}
+{{- end }}
+{{- end }}
diff --git a/charts/agent/templates/nodeless-otelcollector.yaml b/charts/agent/templates/nodeless-otelcollector.yaml
@@ -0,0 +1,40 @@
+{{- if .Values.nodeless.enabled }}{{- /* Sidecar-mode OpenTelemetryCollector resource; rendered only in nodeless mode. */}}
+{{- if eq .Values.nodeless.hostingPlatform "fargate" }}
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: fargate-collector
+spec:
+  mode: sidecar
+  image: "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:latest"  # NOTE(review): unpinned ":latest" tag; consider pinning a collector version
+  env:
+    - name: K8S_NODE_NAME  # referenced as ${env:K8S_NODE_NAME} by the kubeletstats receiver config
+      valueFrom:
+        fieldRef:
+          fieldPath: spec.nodeName
+    - name: OBSERVE_CLUSTER_NAME
+      value: "{{ .Values.cluster.name }}"
+    - name: OBSERVE_CLUSTER_UID
+      valueFrom:
+        configMapKeyRef:
+          name: cluster-info
+          key: id
+    - name: OBSERVE_PROMETHEUS_ENDPOINT
+      value: "{{ .Values.observe.collectionEndpoint.value }}v1/prometheus"  # assumes the collection endpoint ends with "/" - TODO confirm
+    - name: OBSERVE_AUTHORIZATION_HEADER
+      value: "Bearer {{ .Values.observe.token.value }}"  # NOTE(review): the token is rendered in plain text into this resource
+  config:
+    {{- include "observe.sidecar.applyFargateSidecarConfig" . | nindent 4 }}
+  initContainers:
+    - name: kube-cluster-info
+      image: observeinc/kube-cluster-info:v0.11.5
+      imagePullPolicy: Always
+      env:
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: metadata.namespace
+{{- else }}{{- /* fail accepts a single message argument, so the message is formatted with printf first */}}
+{{- fail (printf "Invalid nodeless.hostingPlatform, valid values are 'fargate', provided value is %q" .Values.nodeless.hostingPlatform) }}
+{{- end }}
+{{- end }}
diff --git a/charts/agent/values.yaml b/charts/agent/values.yaml