Skip to content

Commit 093da22

Browse files
fix: OB-41415 send sidecar metrics to observe directly
Customers have been requesting that we support EKS Fargate-hosted clusters. To do this, I add a new Fargate mode (off by default) that will install an OTel operator, which will use a sidecar container to query metrics from the pod it is attached to.
1 parent 9a8b9dc commit 093da22

12 files changed

+111
-633
lines changed

charts/agent/Chart.lock

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,5 @@ dependencies:
2020
- name: opentelemetry-collector
2121
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
2222
version: 0.130.2
23-
- name: opentelemetry-operator
24-
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
25-
version: 0.93.1
26-
digest: sha256:b5548207946689a925841cca60cf59984043d00886dcb93b407c144630af909f
27-
generated: "2025-09-29T16:49:24.216039-07:00"
23+
digest: sha256:1463a6ca81d2cffd7c7cdf60a8bbc1f490ca721a50328f17a9b1f8d06a1dc6b1
24+
generated: "2025-08-26T11:41:10.527947-04:00"

charts/agent/Chart.yaml

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ apiVersion: v2
22
name: agent
33
description: Chart to install K8s collection stack based on Observe Agent
44
type: application
5-
version: 0.70.3
5+
version: 0.71.0
66
appVersion: "2.8.1"
77
dependencies:
88
- name: opentelemetry-collector
@@ -40,11 +40,6 @@ dependencies:
4040
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
4141
alias: gateway
4242
condition: gatewayDeployment.enabled
43-
- name: opentelemetry-operator
44-
version: 0.93.1
45-
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
46-
alias: fargate-sidecar-injector
47-
condition: node.fargateMode
4843
maintainers:
4944
- name: Observe
5045

charts/agent/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# agent
22

3-
![Version: 0.70.3](https://img.shields.io/badge/Version-0.70.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.8.1](https://img.shields.io/badge/AppVersion-2.8.1-informational?style=flat-square)
3+
![Version: 0.71.0](https://img.shields.io/badge/Version-0.71.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 2.8.1](https://img.shields.io/badge/AppVersion-2.8.1-informational?style=flat-square)
44

55
Chart to install K8s collection stack based on Observe Agent
66

@@ -555,6 +555,7 @@ This service is a *single-instance deployment*. It's critical that this service
555555
| node.containers.metrics.enabled | bool | `true` | |
556556
| node.containers.metrics.interval | string | `"60s"` | |
557557
| node.enabled | bool | `true` | Enables the node-logs-metrics agent daemonset for collection of node logs and metrics. The nodes on which metrics and logs are collected can be configured via `affinity` in the `node-logs-metrics` section below. This should be set to false to disable the node-logs-metrics daemonset when running in a serverless environment (ex: EKS Fargate). |
558+
| node.fargateMode | bool | `false` | Enables collection of metrics from EKS Fargate pods. Off by default |
558559
| node.forwarder.enabled | bool | `true` | |
559560
| node.forwarder.logs.enabled | bool | `true` | |
560561
| node.forwarder.metrics.enabled | bool | `true` | |

charts/agent/templates/_config-receivers.tpl

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -137,26 +137,3 @@ prometheus/cadvisor:
137137
replacement: /api/v1/nodes/$$1/proxy/metrics/cadvisor
138138
{{ end }}
139139
{{ end }}
140-
141-
{{- define "config.receivers.prometheus.kubeletstats" -}}
142-
prometheus/kubeletstats:
143-
config:
144-
scrape_configs:
145-
- job_name: 'kubernetes-nodes-kubeletstats'
146-
scheme: https
147-
tls_config:
148-
ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
149-
insecure_skip_verify: true
150-
bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
151-
152-
kubernetes_sd_configs:
153-
- role: node
154-
155-
relabel_configs:
156-
- target_label: __address__
157-
replacement: kubernetes.default.svc:443
158-
- source_labels: [__meta_kubernetes_node_name]
159-
regex: (.+)
160-
target_label: __metrics_path__
161-
replacement: /api/v1/nodes/$$1/proxy/stats/summary
162-
{{- end -}}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{{- define "observe.kubeletstats.receiver" -}}
2+
kubeletstats:
3+
collection_interval: {{.Values.node.containers.metrics.interval}}
4+
auth_type: 'serviceAccount'
5+
endpoint: {{ .endpoint }}
6+
node: '${env:K8S_NODE_NAME}'
7+
insecure_skip_verify: true
8+
k8s_api_config:
9+
auth_type: serviceAccount
10+
metric_groups:
11+
- node
12+
- pod
13+
- container
14+
metrics:
15+
# The following metrics are optional and must be enabled manually as per:
16+
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
17+
container.cpu.usage:
18+
enabled: true
19+
container.uptime:
20+
enabled: true
21+
k8s.container.cpu.node.utilization:
22+
enabled: true
23+
k8s.container.cpu_limit_utilization:
24+
enabled: true
25+
k8s.container.cpu_request_utilization:
26+
enabled: true
27+
k8s.container.memory.node.utilization:
28+
enabled: true
29+
k8s.container.memory_limit_utilization:
30+
enabled: true
31+
k8s.container.memory_request_utilization:
32+
enabled: true
33+
k8s.node.cpu.usage:
34+
enabled: true
35+
k8s.node.uptime:
36+
enabled: true
37+
k8s.pod.cpu.node.utilization:
38+
enabled: true
39+
k8s.pod.cpu.usage:
40+
enabled: true
41+
k8s.pod.cpu_limit_utilization:
42+
enabled: true
43+
k8s.pod.cpu_request_utilization:
44+
enabled: true
45+
k8s.pod.memory.node.utilization:
46+
enabled: true
47+
k8s.pod.memory_limit_utilization:
48+
enabled: true
49+
k8s.pod.memory_request_utilization:
50+
enabled: true
51+
k8s.pod.uptime:
52+
enabled: true
53+
extra_metadata_labels:
54+
- container.id
55+
{{- end }}
56+
Lines changed: 27 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,38 @@
11
{{- define "observe.sidecar.fargateSidecarMetrics.config" -}}
22

3-
{{- $kubeletstatsExporters := (list "otlphttp" "debug") -}}
4-
53
receivers:
6-
kubeletstats:
7-
collection_interval: {{.Values.node.containers.metrics.interval}}
8-
auth_type: 'serviceAccount'
9-
endpoint: https://kubernetes.default.svc/api/v1/nodes/${env:K8S_NODE_NAME}/proxy
10-
node: '${env:K8S_NODE_NAME}'
11-
insecure_skip_verify: true
12-
k8s_api_config:
13-
auth_type: serviceAccount
14-
metric_groups:
15-
- node
16-
- pod
17-
- container
18-
metrics:
19-
# The following metrics are optional and must be enabled manually as per:
20-
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
21-
container.cpu.usage:
22-
enabled: true
23-
container.uptime:
24-
enabled: true
25-
k8s.container.cpu.node.utilization:
26-
enabled: true
27-
k8s.container.cpu_limit_utilization:
28-
enabled: true
29-
k8s.container.cpu_request_utilization:
30-
enabled: true
31-
k8s.container.memory.node.utilization:
32-
enabled: true
33-
k8s.container.memory_limit_utilization:
34-
enabled: true
35-
k8s.container.memory_request_utilization:
36-
enabled: true
37-
k8s.node.cpu.usage:
38-
enabled: true
39-
k8s.node.uptime:
40-
enabled: true
41-
k8s.pod.cpu.node.utilization:
42-
enabled: true
43-
k8s.pod.cpu.usage:
44-
enabled: true
45-
k8s.pod.cpu_limit_utilization:
46-
enabled: true
47-
k8s.pod.cpu_request_utilization:
48-
enabled: true
49-
k8s.pod.memory.node.utilization:
50-
enabled: true
51-
k8s.pod.memory_limit_utilization:
52-
enabled: true
53-
k8s.pod.memory_request_utilization:
54-
enabled: true
55-
k8s.pod.uptime:
56-
enabled: true
57-
extra_metadata_labels:
58-
- container.id
4+
{{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" "https://kubernetes.default.svc/api/v1/nodes/${env:K8S_NODE_NAME}/proxy") | nindent 2 }}
5+
6+
processors:
7+
8+
{{- include "config.processors.memory_limiter" . | nindent 2 }}
9+
{{- include "config.processors.batch" . | nindent 2 }}
10+
{{- include "config.processors.resource_detection.cloud" . | nindent 2 }}
11+
{{- include "config.processors.attributes.k8sattributes" . | nindent 2 }}
12+
{{- include "config.processors.resource.observe_common" . | nindent 2 }}
13+
{{- include "config.processors.deltatocumulative" . | nindent 2 }}
14+
{{- include "config.processors.attributes.add_empty_service_attributes" . | nindent 2 }}
15+
{{- include "config.processors.metricstransform.duplicate_k8s_cpu_metrics" . | nindent 2 }}
16+
{{- include "config.processors.attributes.sidecar_kubeletstats_metrics" . | nindent 2 }}
5917

6018
exporters:
61-
otlphttp:
62-
endpoint: http://observe-agent-forwarder.observe.svc:4318
63-
debug:
64-
verbosity: detailed
19+
{{- include "config.exporters.debug" . | nindent 2 }}
20+
{{- include "config.exporters.prometheusremotewrite" . | nindent 2 }}
21+
22+
{{ $kubeletstatsExporters := (list "prometheusremotewrite/observe") -}}
23+
24+
{{- if eq .Values.agent.config.global.debug.enabled true }}
25+
{{- $kubeletstatsExporters = concat $kubeletstatsExporters ( list "debug/override" ) | uniq }}
26+
{{- end }}
6527

6628
service:
6729
pipelines:
6830
{{- if .Values.node.containers.metrics.enabled }}
6931
metrics/kubeletstats:
70-
receivers: [kubeletstats] # should add processors back eventually
32+
receivers: [kubeletstats]
33+
processors: [memory_limiter, metricstransform/duplicate_k8s_cpu_metrics, k8sattributes, deltatocumulative/observe, batch, resourcedetection/cloud, resource/observe_common, attributes/debug_source_sidecar_kubeletstats_metrics]
7134
exporters: [{{ join ", " $kubeletstatsExporters }}]
72-
{{- end -}}
73-
{{- end }}
35+
{{- else }}
36+
{{- fail "node.containers.metrics.enabled must be true for Fargate sidecar - otherwise no telemetry will be collected" }}
37+
{{- end }}
38+
{{- end }}

charts/agent/templates/_node-logs-metrics-config.tpl

Lines changed: 7 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -66,59 +66,13 @@ receivers:
6666
network: null
6767
{{ end -}}
6868
{{- if .Values.node.containers.metrics.enabled }}
69-
kubeletstats:
70-
collection_interval: {{.Values.node.containers.metrics.interval}}
71-
auth_type: 'serviceAccount'
72-
endpoint: {{ if .Values.node.kubeletstats.useNodeIp }}"${env:K8S_NODE_IP}:10250"{{ else }}"${env:K8S_NODE_NAME}:10250"{{ end }}
73-
node: '${env:K8S_NODE_NAME}'
74-
insecure_skip_verify: true
75-
k8s_api_config:
76-
auth_type: serviceAccount
77-
metric_groups:
78-
- node
79-
- pod
80-
- container
81-
metrics:
82-
# The following metrics are optional and must be enabled manually as per:
83-
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
84-
container.cpu.usage:
85-
enabled: true
86-
container.uptime:
87-
enabled: true
88-
k8s.container.cpu.node.utilization:
89-
enabled: true
90-
k8s.container.cpu_limit_utilization:
91-
enabled: true
92-
k8s.container.cpu_request_utilization:
93-
enabled: true
94-
k8s.container.memory.node.utilization:
95-
enabled: true
96-
k8s.container.memory_limit_utilization:
97-
enabled: true
98-
k8s.container.memory_request_utilization:
99-
enabled: true
100-
k8s.node.cpu.usage:
101-
enabled: true
102-
k8s.node.uptime:
103-
enabled: true
104-
k8s.pod.cpu.node.utilization:
105-
enabled: true
106-
k8s.pod.cpu.usage:
107-
enabled: true
108-
k8s.pod.cpu_limit_utilization:
109-
enabled: true
110-
k8s.pod.cpu_request_utilization:
111-
enabled: true
112-
k8s.pod.memory.node.utilization:
113-
enabled: true
114-
k8s.pod.memory_limit_utilization:
115-
enabled: true
116-
k8s.pod.memory_request_utilization:
117-
enabled: true
118-
k8s.pod.uptime:
119-
enabled: true
120-
extra_metadata_labels:
121-
- container.id
69+
{{- $endpoint := "" }}
70+
{{- if .Values.node.kubeletstats.useNodeIp }}
71+
{{- $endpoint = "\"${env:K8S_NODE_IP}:10250\"" }}
72+
{{- else }}
73+
{{- $endpoint = "\"${env:K8S_NODE_NAME}:10250\"" }}
74+
{{- end }}
75+
{{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" $endpoint) | nindent 2 }}
12276
{{ end -}}
12377
{{- if .Values.node.containers.logs.enabled }}
12478
filelog:

charts/agent/templates/kubeletstats-sidecar.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1-
{{- if .Values.node.fargateSidecar.enabled }}
1+
{{- if .Values.node.fargateMode }}
22
apiVersion: opentelemetry.io/v1beta1
33
kind: OpenTelemetryCollector
44
metadata:
55
name: fargate-sidecar-metrics
66
spec:
77
mode: sidecar
8+
image: "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:latest"
89
env:
910
- name: K8S_NODE_NAME
1011
valueFrom:
1112
fieldRef:
1213
fieldPath: spec.nodeName
14+
- name: OBSERVE_CLUSTER_NAME
15+
value: "{{ .Values.cluster.name }}"
16+
- name: OBSERVE_CLUSTER_UID
17+
valueFrom:
18+
configMapKeyRef:
19+
name: cluster-info
20+
key: id
21+
- name: OBSERVE_PROMETHEUS_ENDPOINT
22+
value: "{{ .Values.observe.collectionEndpoint.value }}v1/prometheus"
23+
- name: OBSERVE_AUTHORIZATION_HEADER
24+
value: "Bearer {{ .Values.observe.token.value }}"
1325
config:
1426
{{- include "observe.sidecar.applyFargateSidecarMetricsConfig" . | nindent 4 }}
1527
initContainers:

charts/agent/values.yaml

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ node:
4747
# -- Enables the node-logs-metrics agent daemonset for collection of node logs and metrics.
4848
# The nodes on which metrics and logs are collected can be configured via `affinity` in the `node-logs-metrics` section below.
4949
  # This should be set to false to disable the node-logs-metrics daemonset when running in a serverless environment (ex: EKS Fargate).
50-
enabled: false
50+
enabled: true
51+
# -- Enables collection of metrics from EKS Fargate pods. Off by default
5152
fargateMode: false
5253
# collects host level metrics from node
5354
metrics:
@@ -101,7 +102,6 @@ node:
101102
# this resolves issues similar to https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/26481#issuecomment-1720797914 for `no such host` or `connection refused`.
102103
useNodeIp: false
103104
forwarder:
104-
mode: deployment
105105
enabled: true
106106
traces:
107107
enabled: true
@@ -231,10 +231,6 @@ agent:
231231
# exporters:
232232
# - otlphttp/extra
233233
# - otlphttp/observe/forward/trace
234-
235-
# -- Additional OTel collector config for fargate-sidecar-metrics custom resource
236-
fargateSidecarMetrics:
237-
# Put any OTel config overrides here.
238234

239235
# -- Additional OTel collector config for gateway deployment
240236
gateway:
@@ -957,7 +953,7 @@ monitor:
957953
forwarder:
958954
# -- The forwarder is run as a daemonset by default, but can be run as a deployment by setting mode to "deployment". Deployment mode
959955
# must be used when running in a serverless environment (ex: EKS Fargate) where daemonsets are not supported.
960-
mode: deployment
956+
mode: daemonset
961957

962958
# -- The `replicaCount` is only used when `mode` is set to "deployment". It is ignored when `mode` is set to "daemonset".
963959
# In deployment mode, this sets the number of replicas (ie the number of forwarder pods to run).
@@ -1251,27 +1247,3 @@ gateway:
12511247
- name: observe-agent-deployment-config
12521248
mountPath: /observe-agent-conf
12531249
# ----------------------------------------- #
1254-
image:
1255-
repository: "otel/opentelemetry-collector-k8s"
1256-
1257-
1258-
fargate-sidecar-injector:
1259-
1260-
# -- This is an otel operator that will inject a sidecar container into all pods in the cluster. This is only needed when running
1261-
# in a serverless environment (ex: EKS Fargate) where daemonsets are not supported.
1262-
1263-
replicaCount: 1
1264-
1265-
# ----------------------------------------- #
1266-
# Different for each deployment/daemonset #
1267-
nameOverride: "fargate-sidecar-injector"
1268-
# !!! IMPORTANT !!! This needs to have same value as namespaceOverride in cluster above
1269-
namespaceOverride: "observe"
1270-
# for now, use defaults for the rest of the values
1271-
# ----------------------------------------- #
1272-
manager:
1273-
collectorImage:
1274-
repository: observeinc/observe-agent
1275-
tag: 2.8.1
1276-
1277-

0 commit comments

Comments
 (0)