Skip to content

Commit e563a8e

Browse files
fix: OB-41415 send sidecar metrics to observe directly
Customers have been requesting that we support EKS Fargate-hosted clusters. To do this, I added a new Fargate mode (off by default) that installs an OTel operator, which uses a sidecar container to query metrics from the pod it is attached to.
1 parent 5026226 commit e563a8e

File tree

10 files changed

+107
-603
lines changed

10 files changed

+107
-603
lines changed

charts/agent/Chart.yaml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,6 @@ dependencies:
4040
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
4141
alias: gateway
4242
condition: gatewayDeployment.enabled
43-
- name: opentelemetry-operator
44-
version: 0.93.1
45-
repository: https://open-telemetry.github.io/opentelemetry-helm-charts
46-
alias: fargate-sidecar-injector
47-
condition: node.fargateMode
4843
maintainers:
4944
- name: Observe
5045

charts/agent/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -586,6 +586,7 @@ This service is a *single-instance deployment*. It's critical that this service
586586
| node.containers.metrics.enabled | bool | `true` | |
587587
| node.containers.metrics.interval | string | `"60s"` | |
588588
| node.enabled | bool | `true` | Enables the node-logs-metrics agent daemonset for collection of node logs and metrics. The nodes on which metrics and logs are collected can be configured via `affinity` in the `node-logs-metrics` section below. This should be set to false to disable the node-log-metrics daemonset when running in a serverless environment (ex: EKS Fargate). |
589+
| node.fargateMode | bool | `false` | Enables collection of metrics from EKS Fargate pods. Off by default. |
589590
| node.forwarder.enabled | bool | `true` | |
590591
| node.forwarder.logs.enabled | bool | `true` | |
591592
| node.forwarder.metrics.convertCumulativeToDelta | bool | `false` | Converts cumulative metrics to delta metrics; all OTel metrics should be sent to Observe with temporality delta for the best experience. |
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
{{- define "observe.kubeletstats.receiver" -}}
2+
kubeletstats:
3+
collection_interval: {{.Values.node.containers.metrics.interval}}
4+
auth_type: 'serviceAccount'
5+
endpoint: {{ .endpoint }}
6+
node: '${env:K8S_NODE_NAME}'
7+
insecure_skip_verify: true
8+
k8s_api_config:
9+
auth_type: serviceAccount
10+
metric_groups:
11+
- node
12+
- pod
13+
- container
14+
metrics:
15+
# The following metrics are optional and must be enabled manually as per:
16+
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
17+
container.cpu.usage:
18+
enabled: true
19+
container.uptime:
20+
enabled: true
21+
k8s.container.cpu.node.utilization:
22+
enabled: true
23+
k8s.container.cpu_limit_utilization:
24+
enabled: true
25+
k8s.container.cpu_request_utilization:
26+
enabled: true
27+
k8s.container.memory.node.utilization:
28+
enabled: true
29+
k8s.container.memory_limit_utilization:
30+
enabled: true
31+
k8s.container.memory_request_utilization:
32+
enabled: true
33+
k8s.node.cpu.usage:
34+
enabled: true
35+
k8s.node.uptime:
36+
enabled: true
37+
k8s.pod.cpu.node.utilization:
38+
enabled: true
39+
k8s.pod.cpu.usage:
40+
enabled: true
41+
k8s.pod.cpu_limit_utilization:
42+
enabled: true
43+
k8s.pod.cpu_request_utilization:
44+
enabled: true
45+
k8s.pod.memory.node.utilization:
46+
enabled: true
47+
k8s.pod.memory_limit_utilization:
48+
enabled: true
49+
k8s.pod.memory_request_utilization:
50+
enabled: true
51+
k8s.pod.uptime:
52+
enabled: true
53+
extra_metadata_labels:
54+
- container.id
55+
{{- end }}
56+
Lines changed: 27 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -1,73 +1,38 @@
11
{{- define "observe.sidecar.fargateSidecarMetrics.config" -}}
22

3-
{{- $kubeletstatsExporters := (list "otlphttp" "debug") -}}
4-
53
receivers:
6-
kubeletstats:
7-
collection_interval: {{.Values.node.containers.metrics.interval}}
8-
auth_type: 'serviceAccount'
9-
endpoint: https://kubernetes.default.svc/api/v1/nodes/${env:K8S_NODE_NAME}/proxy
10-
node: '${env:K8S_NODE_NAME}'
11-
insecure_skip_verify: true
12-
k8s_api_config:
13-
auth_type: serviceAccount
14-
metric_groups:
15-
- node
16-
- pod
17-
- container
18-
metrics:
19-
# The following metrics are optional and must be enabled manually as per:
20-
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
21-
container.cpu.usage:
22-
enabled: true
23-
container.uptime:
24-
enabled: true
25-
k8s.container.cpu.node.utilization:
26-
enabled: true
27-
k8s.container.cpu_limit_utilization:
28-
enabled: true
29-
k8s.container.cpu_request_utilization:
30-
enabled: true
31-
k8s.container.memory.node.utilization:
32-
enabled: true
33-
k8s.container.memory_limit_utilization:
34-
enabled: true
35-
k8s.container.memory_request_utilization:
36-
enabled: true
37-
k8s.node.cpu.usage:
38-
enabled: true
39-
k8s.node.uptime:
40-
enabled: true
41-
k8s.pod.cpu.node.utilization:
42-
enabled: true
43-
k8s.pod.cpu.usage:
44-
enabled: true
45-
k8s.pod.cpu_limit_utilization:
46-
enabled: true
47-
k8s.pod.cpu_request_utilization:
48-
enabled: true
49-
k8s.pod.memory.node.utilization:
50-
enabled: true
51-
k8s.pod.memory_limit_utilization:
52-
enabled: true
53-
k8s.pod.memory_request_utilization:
54-
enabled: true
55-
k8s.pod.uptime:
56-
enabled: true
57-
extra_metadata_labels:
58-
- container.id
4+
{{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" "https://kubernetes.default.svc/api/v1/nodes/${env:K8S_NODE_NAME}/proxy") | nindent 2 }}
5+
6+
processors:
7+
8+
{{- include "config.processors.memory_limiter" . | nindent 2 }}
9+
{{- include "config.processors.batch" . | nindent 2 }}
10+
{{- include "config.processors.resource_detection.cloud" . | nindent 2 }}
11+
{{- include "config.processors.attributes.k8sattributes" . | nindent 2 }}
12+
{{- include "config.processors.resource.observe_common" . | nindent 2 }}
13+
{{- include "config.processors.deltatocumulative" . | nindent 2 }}
14+
{{- include "config.processors.attributes.add_empty_service_attributes" . | nindent 2 }}
15+
{{- include "config.processors.metricstransform.duplicate_k8s_cpu_metrics" . | nindent 2 }}
16+
{{- include "config.processors.attributes.sidecar_kubeletstats_metrics" . | nindent 2 }}
5917

6018
exporters:
61-
otlphttp:
62-
endpoint: http://observe-agent-forwarder.observe.svc:4318
63-
debug:
64-
verbosity: detailed
19+
{{- include "config.exporters.debug" . | nindent 2 }}
20+
{{- include "config.exporters.prometheusremotewrite" . | nindent 2 }}
21+
22+
{{ $kubeletstatsExporters := (list "prometheusremotewrite/observe") -}}
23+
24+
{{- if eq .Values.agent.config.global.debug.enabled true }}
25+
{{- $kubeletstatsExporters = concat $kubeletstatsExporters ( list "debug/override" ) | uniq }}
26+
{{- end }}
6527

6628
service:
6729
pipelines:
6830
{{- if .Values.node.containers.metrics.enabled }}
6931
metrics/kubeletstats:
70-
receivers: [kubeletstats] # should add processors back eventually
32+
receivers: [kubeletstats]
33+
processors: [memory_limiter, metricstransform/duplicate_k8s_cpu_metrics, k8sattributes, deltatocumulative/observe, batch, resourcedetection/cloud, resource/observe_common, attributes/debug_source_sidecar_kubeletstats_metrics]
7134
exporters: [{{ join ", " $kubeletstatsExporters }}]
72-
{{- end -}}
73-
{{- end }}
35+
{{- else }}
36+
{{- fail "node.containers.metrics.enabled must be true for Fargate sidecar - otherwise no telemetry will be collected" }}
37+
{{- end }}
38+
{{- end }}

charts/agent/templates/_node-logs-metrics-config.tpl

Lines changed: 7 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -70,59 +70,13 @@ receivers:
7070
network: null
7171
{{ end -}}
7272
{{- if .Values.node.containers.metrics.enabled }}
73-
kubeletstats:
74-
collection_interval: {{.Values.node.containers.metrics.interval}}
75-
auth_type: 'serviceAccount'
76-
endpoint: {{ if .Values.node.kubeletstats.useNodeIp }}"${env:K8S_NODE_IP}:10250"{{ else }}"${env:K8S_NODE_NAME}:10250"{{ end }}
77-
node: '${env:K8S_NODE_NAME}'
78-
insecure_skip_verify: true
79-
k8s_api_config:
80-
auth_type: serviceAccount
81-
metric_groups:
82-
- node
83-
- pod
84-
- container
85-
metrics:
86-
# The following metrics are optional and must be enabled manually as per:
87-
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/kubeletstatsreceiver/documentation.md#optional-metrics
88-
container.cpu.usage:
89-
enabled: true
90-
container.uptime:
91-
enabled: true
92-
k8s.container.cpu.node.utilization:
93-
enabled: true
94-
k8s.container.cpu_limit_utilization:
95-
enabled: true
96-
k8s.container.cpu_request_utilization:
97-
enabled: true
98-
k8s.container.memory.node.utilization:
99-
enabled: true
100-
k8s.container.memory_limit_utilization:
101-
enabled: true
102-
k8s.container.memory_request_utilization:
103-
enabled: true
104-
k8s.node.cpu.usage:
105-
enabled: true
106-
k8s.node.uptime:
107-
enabled: true
108-
k8s.pod.cpu.node.utilization:
109-
enabled: true
110-
k8s.pod.cpu.usage:
111-
enabled: true
112-
k8s.pod.cpu_limit_utilization:
113-
enabled: true
114-
k8s.pod.cpu_request_utilization:
115-
enabled: true
116-
k8s.pod.memory.node.utilization:
117-
enabled: true
118-
k8s.pod.memory_limit_utilization:
119-
enabled: true
120-
k8s.pod.memory_request_utilization:
121-
enabled: true
122-
k8s.pod.uptime:
123-
enabled: true
124-
extra_metadata_labels:
125-
- container.id
73+
{{- $endpoint := "" }}
74+
{{- if .Values.node.kubeletstats.useNodeIp }}
75+
{{- $endpoint = "\"${env:K8S_NODE_IP}:10250\"" }}
76+
{{- else }}
77+
{{- $endpoint = "\"${env:K8S_NODE_NAME}:10250\"" }}
78+
{{- end }}
79+
{{- include "observe.kubeletstats.receiver" (dict "Values" .Values "endpoint" $endpoint) | nindent 2 }}
12680
{{ end -}}
12781
{{- if .Values.node.containers.logs.enabled }}
12882
filelog:

charts/agent/templates/kubeletstats-sidecar.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,27 @@
1-
{{- if .Values.node.fargateSidecar.enabled }}
1+
{{- if .Values.node.fargateMode }}
22
apiVersion: opentelemetry.io/v1beta1
33
kind: OpenTelemetryCollector
44
metadata:
55
name: fargate-sidecar-metrics
66
spec:
77
mode: sidecar
8+
image: "ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib:latest"
89
env:
910
- name: K8S_NODE_NAME
1011
valueFrom:
1112
fieldRef:
1213
fieldPath: spec.nodeName
14+
- name: OBSERVE_CLUSTER_NAME
15+
value: "{{ .Values.cluster.name }}"
16+
- name: OBSERVE_CLUSTER_UID
17+
valueFrom:
18+
configMapKeyRef:
19+
name: cluster-info
20+
key: id
21+
- name: OBSERVE_PROMETHEUS_ENDPOINT
22+
value: "{{ .Values.observe.collectionEndpoint.value }}v1/prometheus"
23+
- name: OBSERVE_AUTHORIZATION_HEADER
24+
value: "Bearer {{ .Values.observe.token.value }}"
1325
config:
1426
{{- include "observe.sidecar.applyFargateSidecarMetricsConfig" . | nindent 4 }}
1527
initContainers:

charts/agent/values.yaml

Lines changed: 3 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ node:
4747
# -- Enables the node-logs-metrics agent daemonset for collection of node logs and metrics.
4848
# The nodes on which metrics and logs are collected can be configured via `affinity` in the `node-logs-metrics` section below.
4949
# This should be set to false to disable the node-log-metrics daemonset when running in a serverless environment (ex: EKS Fargate).
50-
enabled: false
50+
enabled: true
51+
# -- Enables collection of metrics from EKS Fargate pods. Off by default.
5152
fargateMode: false
5253
# collects host level metrics from node
5354
metrics:
@@ -101,7 +102,6 @@ node:
101102
# this resolves issues similar to https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/26481#issuecomment-1720797914 for `no such host` or `connection refused`.
102103
useNodeIp: false
103104
forwarder:
104-
mode: deployment
105105
enabled: true
106106
traces:
107107
enabled: true
@@ -254,10 +254,6 @@ agent:
254254
# exporters:
255255
# - otlphttp/extra
256256
# - otlphttp/observe/forward/trace
257-
258-
# -- Additional OTel collector config for fargate-sidecar-metrics custom resource
259-
fargateSidecarMetrics:
260-
# Put any OTel config overrides here.
261257

262258
# -- Additional OTel collector config for gateway deployment
263259
gateway:
@@ -1020,7 +1016,7 @@ monitor:
10201016
forwarder:
10211017
# -- The forwarder is run as a daemonset by default, but can be run as a deployment by setting mode to "deployment". Deployment mode
10221018
# must be used when running in a serverless environment (ex: EKS Fargate) where daemonsets are not supported.
1023-
mode: deployment
1019+
mode: daemonset
10241020

10251021
# -- The `replicaCount` is only used when `mode` is set to "deployment". It is ignored when `mode` is set to "daemonset".
10261022
# In deployment mode, this sets the number of replicas (ie the number of forwarder pods to run).
@@ -1330,27 +1326,3 @@ gateway:
13301326
- name: observe-agent-deployment-config
13311327
mountPath: /observe-agent-conf
13321328
# ----------------------------------------- #
1333-
image:
1334-
repository: "otel/opentelemetry-collector-k8s"
1335-
1336-
1337-
fargate-sidecar-injector:
1338-
1339-
# -- This is an otel operator that will inject a sidecar container into all pods in the cluster. This is only needed when running
1340-
# in a serverless environment (ex: EKS Fargate) where daemonsets are not supported.
1341-
1342-
replicaCount: 1
1343-
1344-
# ----------------------------------------- #
1345-
# Different for each deployment/daemonset #
1346-
nameOverride: "fargate-sidecar-injector"
1347-
# !!! IMPORTANT !!! This needs to have same value as namespaceOverride in cluster above
1348-
namespaceOverride: "observe"
1349-
# for now, use defaults for the rest of the values
1350-
# ----------------------------------------- #
1351-
manager:
1352-
collectorImage:
1353-
repository: observeinc/observe-agent
1354-
tag: 2.8.1
1355-
1356-

cluster_role.yaml

Lines changed: 0 additions & 31 deletions
This file was deleted.

forwarder-deploy.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

0 commit comments

Comments
 (0)