diff --git a/.chloggen/eks-batch-nodes.yaml b/.chloggen/eks-batch-nodes.yaml new file mode 100644 index 0000000000..1fa5c454f2 --- /dev/null +++ b/.chloggen/eks-batch-nodes.yaml @@ -0,0 +1,12 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: enhancement +# The name of the component, or a single word describing the area of concern, (e.g. agent, clusterReceiver, gateway, operator, chart, other) +component: agent +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Document how to run the Collector agent daemonset on AWS Batch-managed EKS nodes +# One or more tracking issues related to the change +issues: [2398] +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: diff --git a/docs/advanced-configuration.md b/docs/advanced-configuration.md index 615b88147d..d5d4505e7f 100644 --- a/docs/advanced-configuration.md +++ b/docs/advanced-configuration.md @@ -348,6 +348,52 @@ aws eks create-pod-identity-association \ --region $AWS_REGION ```` +## EKS: Running on AWS Batch nodes + +AWS Batch on EKS taints its managed nodes with `batch.amazonaws.com/batch-node` +to prevent general workloads from scheduling there. The Collector agent daemonset +must tolerate this taint to collect logs and metrics from those nodes. +This configuration is for EKS clusters that run the agent daemonset; it does not +apply to `eks/fargate`. + +The top-level [`tolerations`](../helm-charts/splunk-otel-collector/values.yaml) +value controls the agent daemonset tolerations only (not the cluster receiver, +gateway, or operator; each has its own `tolerations` sub-key). Because Helm +replaces list values entirely on upgrade, your custom `tolerations` list must +include **both** the chart's default tolerations and the new Batch entries: + +```yaml +# Set distribution to match your cluster type (eks or eks/auto-mode). +distribution: eks +cloudProvider: aws + +tolerations: + # Chart defaults - keep these to continue collecting from control-plane and + # infra nodes. + - key: node-role.kubernetes.io/master + effect: NoSchedule + operator: Exists + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule + operator: Exists + - key: kubernetes.io/system-node + effect: NoSchedule + operator: Exists + - key: node-role.kubernetes.io/infra + effect: NoSchedule + operator: Exists + # AWS Batch node taint - allows scheduling on Batch-managed nodes. + - key: batch.amazonaws.com/batch-node + operator: Exists + effect: NoSchedule + - key: batch.amazonaws.com/batch-node + operator: Exists + effect: NoExecute +``` + +See the [eks-batch-nodes example](../examples/eks-batch-nodes/README.md) for a +ready-to-use values file. + ## EKS Fargate support If you want to run the Splunk OpenTelemetry Collector in [Amazon Elastic Kubernetes Service diff --git a/examples/eks-batch-nodes/README.md b/examples/eks-batch-nodes/README.md new file mode 100644 index 0000000000..776518601f --- /dev/null +++ b/examples/eks-batch-nodes/README.md @@ -0,0 +1,32 @@ +# Example: Collector on EKS with AWS Batch nodes + +This example shows how to configure the Splunk OpenTelemetry Collector to +collect logs and metrics from EKS nodes managed by +[AWS Batch on EKS](https://docs.aws.amazon.com/batch/latest/userguide/jobs_eks.html). 
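+
+As a quick check before installing (this snippet is illustrative only; it
+assumes `kubectl` access to the cluster and is not part of the chart), you can
+list node taints to confirm which nodes AWS Batch manages:
+
+```bash
+# Print each node together with the keys of its taints. Batch-managed nodes
+# include batch.amazonaws.com/batch-node in their taint list.
+kubectl get nodes \
+  -o custom-columns='NAME:.metadata.name,TAINTS:.spec.taints[*].key'
+```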
+
+## Background
+
+AWS Batch taints its managed EKS nodes with `batch.amazonaws.com/batch-node` to
+prevent general workloads from scheduling there. The Collector agent daemonset
+must explicitly tolerate this taint to run on those nodes.
+
+The chart's top-level `tolerations` value is a list. Helm replaces lists
+entirely during upgrades, so your values file must include **both** the chart's
+built-in default tolerations and the new AWS Batch entries. Omitting the
+defaults would stop the agent from scheduling on control-plane and infra nodes.
+
+## Usage
+
+```bash
+helm install my-splunk-otel-collector \
+  --values eks-batch-nodes-values.yaml \
+  splunk-otel-collector-chart/splunk-otel-collector
+```
+
+Replace the `CHANGEME` placeholders before running.
+
+## See also
+
+- [Advanced configuration - EKS: Running on AWS Batch nodes](../../docs/advanced-configuration.md#eks-running-on-aws-batch-nodes)
+- [Run a DaemonSet on AWS Batch managed nodes](https://docs.aws.amazon.com/batch/latest/userguide/daemonset-on-batch-eks-nodes.html)
+- [Kubernetes taints and tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
diff --git a/examples/eks-batch-nodes/eks-batch-nodes-values.yaml b/examples/eks-batch-nodes/eks-batch-nodes-values.yaml
new file mode 100644
index 0000000000..708de081d2
--- /dev/null
+++ b/examples/eks-batch-nodes/eks-batch-nodes-values.yaml
@@ -0,0 +1,33 @@
+splunkObservability:
+  realm: CHANGEME
+  accessToken: CHANGEME
+
+# The cluster name is auto-discovered for eks and eks/auto-mode.
+# Set to eks/auto-mode if using EKS Auto Mode.
+distribution: eks
+cloudProvider: aws
+
+# Helm replaces list values entirely on upgrade, so include all tolerations you
+# need: both the chart defaults below and the AWS Batch additions at the end.
+tolerations:
+  # Chart defaults - keep these to collect from control-plane and infra nodes.
+  - key: node-role.kubernetes.io/master
+    effect: NoSchedule
+    operator: Exists
+  - key: node-role.kubernetes.io/control-plane
+    effect: NoSchedule
+    operator: Exists
+  - key: kubernetes.io/system-node
+    effect: NoSchedule
+    operator: Exists
+  - key: node-role.kubernetes.io/infra
+    effect: NoSchedule
+    operator: Exists
+  # AWS Batch node taint - allows the agent daemonset to schedule on
+  # Batch-managed nodes.
+ - key: batch.amazonaws.com/batch-node + operator: Exists + effect: NoSchedule + - key: batch.amazonaws.com/batch-node + operator: Exists + effect: NoExecute diff --git a/examples/eks-batch-nodes/rendered_manifests/clusterRole.yaml b/examples/eks-batch-nodes/rendered_manifests/clusterRole.yaml new file mode 100644 index 0000000000..ef3d36cbdc --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/clusterRole.yaml @@ -0,0 +1,99 @@ +--- +# Source: splunk-otel-collector/templates/clusterRole.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: default-splunk-otel-collector + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + chart: splunk-otel-collector-0.150.0 + release: default +rules: +- apiGroups: + - "" + resources: + - events + - namespaces + - namespaces/status + - nodes + - nodes/spec + - nodes/stats + - nodes/proxy + - pods + - pods/status + - persistentvolumeclaims + - persistentvolumes + - replicationcontrollers + - replicationcontrollers/status + - resourcequotas + - services + verbs: + - get + - list + - watch +- apiGroups: + - apps + resources: + - daemonsets + - deployments + - replicasets + - statefulsets + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - get + - list + - watch +- apiGroups: + - batch + resources: + - jobs + - cronjobs + verbs: + - get + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - get + - list + - watch +- nonResourceURLs: + - /metrics + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get + resourceNames: + - aws-auth +- apiGroups: + - events.k8s.io + resources: + - events + - namespaces + verbs: + - get + - list + - watch diff --git a/examples/eks-batch-nodes/rendered_manifests/clusterRoleBinding.yaml b/examples/eks-batch-nodes/rendered_manifests/clusterRoleBinding.yaml new file mode 100644 index 0000000000..a0b3f00ccd --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/clusterRoleBinding.yaml @@ -0,0 +1,23 @@ +--- +# Source: splunk-otel-collector/templates/clusterRoleBinding.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: default-splunk-otel-collector + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + chart: splunk-otel-collector-0.150.0 + release: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: default-splunk-otel-collector +subjects: +- kind: ServiceAccount + name: default-splunk-otel-collector + namespace: default diff --git a/examples/eks-batch-nodes/rendered_manifests/configmap-agent.yaml b/examples/eks-batch-nodes/rendered_manifests/configmap-agent.yaml new file mode 100644 index 0000000000..cafa248e3e --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/configmap-agent.yaml @@ -0,0 +1,433 @@ +--- +# Source: splunk-otel-collector/templates/configmap-agent.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: default-splunk-otel-collector-otel-agent + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + 
helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + chart: splunk-otel-collector-0.150.0 + release: default +data: + relay: | + exporters: + otlp_http: + auth: + authenticator: headers_setter + metrics_endpoint: https://ingest.CHANGEME.observability.splunkcloud.com/v2/datapoint/otlp + traces_endpoint: https://ingest.CHANGEME.observability.splunkcloud.com/v2/trace/otlp + otlp_http/entities: + auth: + authenticator: headers_setter + logs_endpoint: https://ingest.CHANGEME.observability.splunkcloud.com/v3/event + signalfx: + access_token: ${SPLUNK_OBSERVABILITY_ACCESS_TOKEN} + api_url: https://api.CHANGEME.observability.splunkcloud.com + correlation: null + ingest_url: https://ingest.CHANGEME.observability.splunkcloud.com + root_path: /hostfs + sync_host_metadata: true + signalfx/histograms: + access_token: ${SPLUNK_OBSERVABILITY_ACCESS_TOKEN} + api_url: https://api.CHANGEME.observability.splunkcloud.com + ingest_url: https://ingest.CHANGEME.observability.splunkcloud.com + send_otlp_histograms: true + extensions: + headers_setter: + headers: + - action: upsert + default_value: ${SPLUNK_OBSERVABILITY_ACCESS_TOKEN} + from_context: X-SF-TOKEN + key: X-SF-TOKEN + health_check: + endpoint: 0.0.0.0:13133 + k8s_observer: + auth_type: serviceAccount + node: ${K8S_NODE_NAME} + zpages: null + processors: + batch: + metadata_keys: + - X-SF-Token + filter/logs: + logs: + exclude: + match_type: strict + resource_attributes: + - key: splunk.com/exclude + value: "true" + k8s_attributes: + extract: + annotations: + - from: pod + key: splunk.com/sourcetype + - from: namespace + key: splunk.com/exclude + tag_name: splunk.com/exclude + - from: pod + key: splunk.com/exclude + tag_name: splunk.com/exclude + - from: namespace + key: splunk.com/index + tag_name: com.splunk.index + - from: pod + key: splunk.com/index + tag_name: com.splunk.index + labels: + - key: app + metadata: + - k8s.namespace.name + - k8s.node.name + - k8s.pod.name + - k8s.pod.uid + - container.id + - container.image.name + - container.image.tag + filter: + node_from_env_var: K8S_NODE_NAME + pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.uid + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: ip + - sources: + - from: connection + memory_limiter: + check_interval: 2s + limit_mib: ${SPLUNK_MEMORY_LIMIT_MIB} + resource: + attributes: + - action: insert + key: k8s.node.name + value: ${K8S_NODE_NAME} + resource/add_agent_k8s: + attributes: + - action: insert + key: k8s.pod.name + value: ${K8S_POD_NAME} + - action: insert + key: k8s.pod.uid + value: ${K8S_POD_UID} + - action: insert + key: k8s.namespace.name + value: ${K8S_NAMESPACE} + resource/add_mode: + attributes: + - action: insert + key: otelcol.service.mode + value: agent + resource/logs: + attributes: + - action: upsert + from_attribute: k8s.pod.annotations.splunk.com/sourcetype + key: com.splunk.sourcetype + - action: delete + key: k8s.pod.annotations.splunk.com/sourcetype + - action: delete + key: splunk.com/exclude + resourcedetection: + detectors: + - env + - eks + - system + eks: + node_from_env_var: K8S_NODE_NAME + resource_attributes: + cloud.account.id: + enabled: true + cloud.availability_zone: + enabled: true + cloud.region: + enabled: true + host.id: + enabled: true + host.image.id: + enabled: true + host.name: + enabled: true + host.type: + enabled: true + 
k8s.cluster.name: + enabled: true + override: true + timeout: 15s + resourcedetection/k8s_cluster_name: + detectors: + - eks + eks: + node_from_env_var: K8S_NODE_NAME + resource_attributes: + cloud.platform: + enabled: false + cloud.provider: + enabled: false + k8s.cluster.name: + enabled: true + override: true + timeout: 15s + receivers: + hostmetrics: + collection_interval: 10s + root_path: /hostfs + scrapers: + cpu: null + disk: null + filesystem: + include_mount_points: + match_type: strict + mount_points: + - / + load: null + memory: null + network: null + paging: null + processes: null + jaeger: + protocols: + grpc: + endpoint: 0.0.0.0:14250 + thrift_http: + endpoint: 0.0.0.0:14268 + kubeletstats: + auth_type: serviceAccount + collection_interval: 10s + endpoint: ${K8S_NODE_IP}:10250 + extra_metadata_labels: + - container.id + metric_groups: + - container + - pod + - node + metrics: + k8s.node.cpu.usage: + enabled: false + k8s.pod.cpu.usage: + enabled: false + nop: null + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + prometheus/agent: + config: + scrape_configs: + - job_name: otel-agent + metric_relabel_configs: + - action: drop + regex: promhttp_metric_handler_errors.* + source_labels: + - __name__ + - action: drop + regex: otelcol_processor_batch_.* + source_labels: + - __name__ + - action: drop + regex: .*otelcol\.k8s\.pod\.association + source_labels: + - __name__ + scrape_interval: 10s + static_configs: + - targets: + - localhost:8889 + receiver_creator: + receivers: + prometheus/coredns: + config: + config: + scrape_configs: + - job_name: coredns + metric_relabel_configs: + - action: keep + regex: (coredns_dns_request_duration_seconds|coredns_cache_misses_total|coredns_cache_hits_total|coredns_cache_entries|coredns_dns_responses_total|coredns_dns_requests_total|rest_client_requests_total|rest_client_request_duration_seconds)(?:_sum|_count|_bucket)? + source_labels: + - __name__ + scrape_interval: 10s + static_configs: + - targets: + - '`endpoint`:9153' + rule: type == "pod" && labels["k8s-app"] == "kube-dns" + prometheus/kube-controller-manager: + config: + config: + scrape_configs: + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + type: Bearer + job_name: kube-controller-manager + metric_relabel_configs: + - action: keep + regex: (workqueue_longest_running_processor_seconds|workqueue_unfinished_work_seconds|workqueue_depth|workqueue_retries_total|workqueue_queue_duration_seconds)(?:_sum|_count|_bucket)? + source_labels: + - __name__ + scheme: https + scrape_interval: 10s + static_configs: + - targets: + - '`endpoint`:10257' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + rule: type == "pod" && (labels["k8s-app"] == "kube-controller-manager" || + labels["component"] == "kube-controller-manager") + prometheus/kubernetes-apiserver: + config: + config: + scrape_configs: + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + type: Bearer + job_name: kubernetes-apiserver + metric_relabel_configs: + - action: keep + regex: (apiserver_longrunning_requests|apiserver_request_duration_seconds|apiserver_storage_objects|apiserver_response_sizes|apiserver_request_total|kubernetes_build_info|rest_client_requests_total|rest_client_request_duration_seconds|apiserver_storage_size_bytes|apiserver_requested_deprecated_apis)(?:_sum|_count|_bucket)? 
+ source_labels: + - __name__ + scheme: https + scrape_interval: 10s + static_configs: + - targets: + - '`endpoint`' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + rule: type == "port" && port == 443 && (pod.labels["k8s-app"] == "kube-apiserver" + || pod.labels["component"] == "kube-apiserver") + prometheus/kubernetes-proxy: + config: + config: + scrape_configs: + - job_name: kubernetes-proxy + metric_relabel_configs: + - action: keep + regex: (kubeproxy_sync_proxy_rules_iptables_restore_failures_total|kubeproxy_sync_proxy_rules_service_changes_total|kubeproxy_sync_proxy_rules_service_changes_pending|kubeproxy_sync_proxy_rules_duration_seconds|kubeproxy_network_programming_duration_seconds)(?:_sum|_count|_bucket)? + source_labels: + - __name__ + - action: drop + regex: kubeproxy_network_programming_duration_seconds_bucket;([1-3][1-46-9]|[4-9][1-9]|100|110|115|270)\.0 + source_labels: + - __name__ + - le + scrape_interval: 10s + static_configs: + - targets: + - '`endpoint`:10249' + rule: type == "pod" && labels["k8s-app"] == "kube-proxy" + prometheus/kubernetes-scheduler: + config: + config: + scrape_configs: + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + type: Bearer + job_name: kubernetes-scheduler + metric_relabel_configs: + - action: keep + regex: (rest_client_request_duration_seconds|rest_client_requests_total|scheduler_pending_pods|scheduler_schedule_attempts_total|scheduler_queue_incoming_pods_total|scheduler_preemption_attempts_total|scheduler_scheduling_algorithm_duration_seconds|scheduler_pod_scheduling_sli_duration_seconds)(?:_sum|_count|_bucket)? + source_labels: + - __name__ + scheme: https + scrape_interval: 10s + static_configs: + - targets: + - '`endpoint`:10259' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + rule: type == "pod" && (labels["k8s-app"] == "kube-scheduler" || labels["component"] + == "kube-scheduler") + watch_observers: + - k8s_observer + zipkin: + endpoint: 0.0.0.0:9411 + service: + extensions: + - health_check + - headers_setter + - k8s_observer + - zpages + pipelines: + logs/entities: + exporters: + - otlp_http/entities + processors: + - memory_limiter + - batch + - resourcedetection + - resource + receivers: + - nop + metrics: + exporters: + - signalfx + processors: + - memory_limiter + - batch + - resourcedetection + - resource + receivers: + - hostmetrics + - kubeletstats + - otlp + metrics/agent: + exporters: + - signalfx + processors: + - memory_limiter + - batch + - resource/add_agent_k8s + - resourcedetection + - resource + - resource/add_mode + receivers: + - prometheus/agent + metrics/histograms: + exporters: + - signalfx/histograms + processors: + - memory_limiter + - batch + - resource/add_agent_k8s + - resourcedetection + - resource + receivers: + - receiver_creator + traces: + exporters: + - otlp_http + - signalfx + processors: + - memory_limiter + - k8s_attributes + - batch + - resourcedetection + - resource + receivers: + - otlp + - jaeger + - zipkin + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: localhost + port: 8889 + without_scope_info: true + without_type_suffix: true + without_units: true + resource: + service.name: otel-agent diff --git a/examples/eks-batch-nodes/rendered_manifests/configmap-cluster-receiver.yaml b/examples/eks-batch-nodes/rendered_manifests/configmap-cluster-receiver.yaml new file mode 100644 index 0000000000..eb6c7c4a2e 
--- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/configmap-cluster-receiver.yaml @@ -0,0 +1,237 @@ +--- +# Source: splunk-otel-collector/templates/configmap-cluster-receiver.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: default-splunk-otel-collector-otel-k8s-cluster-receiver + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + chart: splunk-otel-collector-0.150.0 + release: default +data: + relay: | + exporters: + signalfx: + access_token: ${SPLUNK_OBSERVABILITY_ACCESS_TOKEN} + api_url: https://api.CHANGEME.observability.splunkcloud.com + disable_default_translation_rules: true + ingest_url: https://ingest.CHANGEME.observability.splunkcloud.com + timeout: 10s + signalfx/histograms: + access_token: ${SPLUNK_OBSERVABILITY_ACCESS_TOKEN} + api_url: https://api.CHANGEME.observability.splunkcloud.com + disable_default_translation_rules: true + ingest_url: https://ingest.CHANGEME.observability.splunkcloud.com + send_otlp_histograms: true + timeout: 10s + extensions: + health_check: + endpoint: 0.0.0.0:13134 + processors: + batch: + send_batch_max_size: 32768 + memory_limiter: + check_interval: 2s + limit_mib: ${SPLUNK_MEMORY_LIMIT_MIB} + resource: + attributes: + - action: insert + key: metric_source + value: kubernetes + resource/add_cluster_host: + attributes: + - action: insert + key: host.name + value: ${K8S_NODE_NAME} + resource/add_collector_k8s: + attributes: + - action: insert + key: k8s.node.name + value: ${K8S_NODE_NAME} + - action: insert + key: k8s.pod.name + value: ${K8S_POD_NAME} + - action: insert + key: k8s.pod.uid + value: ${K8S_POD_UID} + - action: insert + key: k8s.namespace.name + value: ${K8S_NAMESPACE} + resource/add_mode: + attributes: + - action: insert + key: otelcol.service.mode + value: clusterReceiver + resource/k8s_cluster: + attributes: + - action: insert + key: receiver + value: k8scluster + resourcedetection: + detectors: + - env + - eks + - system + eks: + node_from_env_var: K8S_NODE_NAME + resource_attributes: + cloud.account.id: + enabled: true + cloud.availability_zone: + enabled: true + cloud.region: + enabled: true + host.id: + enabled: true + host.image.id: + enabled: true + host.name: + enabled: true + host.type: + enabled: true + k8s.cluster.name: + enabled: true + override: true + timeout: 15s + resourcedetection/k8s_cluster_name: + detectors: + - eks + eks: + node_from_env_var: K8S_NODE_NAME + resource_attributes: + cloud.platform: + enabled: false + cloud.provider: + enabled: false + k8s.cluster.name: + enabled: true + override: true + timeout: 15s + transform/k8shpascaletargetref: + error_mode: ignore + metric_statements: + - context: resource + statements: + - set(attributes["k8s.replicaset.name"], resource.attributes["k8s.hpa.scaletargetref.name"]) + where IsMatch(resource.attributes["k8s.hpa.scaletargetref.kind"], "ReplicaSet") + - set(attributes["k8s.statefulset.name"], resource.attributes["k8s.hpa.scaletargetref.name"]) + where IsMatch(resource.attributes["k8s.hpa.scaletargetref.kind"], "StatefulSet") + - set(attributes["k8s.deployment.name"], resource.attributes["k8s.hpa.scaletargetref.name"]) + where IsMatch(resource.attributes["k8s.hpa.scaletargetref.kind"], "Deployment") + receivers: + k8s_cluster: + auth_type: serviceAccount + metadata_exporters: + - signalfx + metrics: + k8s.container.status.reason: + 
enabled: true + k8s.node.condition: + enabled: true + k8s.pod.status_reason: + enabled: true + resource_attributes: + k8s.hpa.scaletargetref.kind: + enabled: true + k8s.hpa.scaletargetref.name: + enabled: true + k8s.kubelet.version: + enabled: true + k8s.pod.qos_class: + enabled: true + prometheus/k8s_cluster_receiver: + config: + scrape_configs: + - job_name: otel-k8s-cluster-receiver + metric_relabel_configs: + - action: drop + regex: promhttp_metric_handler_errors.* + source_labels: + - __name__ + - action: drop + regex: otelcol_processor_batch_.* + source_labels: + - __name__ + - action: drop + regex: .*otelcol\.k8s\.pod\.association + source_labels: + - __name__ + scrape_interval: 10s + static_configs: + - targets: + - localhost:8899 + prometheus/kubernetes-apiserver: + config: + scrape_configs: + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + job_name: kubernetes-apiserver + metric_relabel_configs: + - action: keep + regex: (apiserver_longrunning_requests|apiserver_request_duration_seconds|apiserver_storage_objects|apiserver_response_sizes|apiserver_request_total|kubernetes_build_info|rest_client_requests_total|rest_client_request_duration_seconds|apiserver_storage_size_bytes|apiserver_requested_deprecated_apis)(?:_sum|_count|_bucket)? + source_labels: + - __name__ + scheme: https + scrape_interval: 10s + static_configs: + - targets: + - ${KUBERNETES_SERVICE_HOST}:443 + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + service: + extensions: + - health_check + pipelines: + metrics: + exporters: + - signalfx + processors: + - memory_limiter + - batch + - transform/k8shpascaletargetref + - resourcedetection/k8s_cluster_name + - resource + - resource/k8s_cluster + receivers: + - k8s_cluster + metrics/collector: + exporters: + - signalfx + processors: + - memory_limiter + - batch + - resource/add_collector_k8s + - resourcedetection + - resource + - resource/add_mode + receivers: + - prometheus/k8s_cluster_receiver + metrics/histograms: + exporters: + - signalfx/histograms + processors: + - memory_limiter + - batch + - resourcedetection/k8s_cluster_name + - resource + receivers: + - prometheus/kubernetes-apiserver + telemetry: + metrics: + readers: + - pull: + exporter: + prometheus: + host: localhost + port: 8899 + without_scope_info: true + without_type_suffix: true + without_units: true + resource: + service.name: otel-k8s-cluster-receiver diff --git a/examples/eks-batch-nodes/rendered_manifests/daemonset.yaml b/examples/eks-batch-nodes/rendered_manifests/daemonset.yaml new file mode 100644 index 0000000000..831a252e5b --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/daemonset.yaml @@ -0,0 +1,196 @@ +--- +# Source: splunk-otel-collector/templates/daemonset.yaml +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: default-splunk-otel-collector-agent + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + component: otel-collector-agent + chart: splunk-otel-collector-0.150.0 + release: default +spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + selector: + matchLabels: + app: splunk-otel-collector + release: default + template: + metadata: + labels: + app: splunk-otel-collector + component: otel-collector-agent + release: default + annotations: + 
checksum/config: 68bc76dbdc43f8a6fa241581201db1d0733bb574686ea20acdca705c430bbb89 + kubectl.kubernetes.io/default-container: otel-collector + spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + serviceAccountName: default-splunk-otel-collector + nodeSelector: + kubernetes.io/os: linux + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoSchedule + key: kubernetes.io/system-node + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/infra + operator: Exists + - effect: NoSchedule + key: batch.amazonaws.com/batch-node + operator: Exists + - effect: NoExecute + key: batch.amazonaws.com/batch-node + operator: Exists + containers: + - name: otel-collector + args: + - --config=/conf/relay.yaml + ports: + - name: jaeger-grpc + containerPort: 14250 + hostPort: 14250 + protocol: TCP + - name: jaeger-thrift + containerPort: 14268 + hostPort: 14268 + protocol: TCP + - name: otlp + containerPort: 4317 + hostPort: 4317 + protocol: TCP + - name: otlp-http + containerPort: 4318 + hostPort: 4318 + protocol: TCP + - name: zipkin + containerPort: 9411 + hostPort: 9411 + protocol: TCP + image: quay.io/signalfx/splunk-otel-collector:0.150.0 + imagePullPolicy: IfNotPresent + env: + - name: SPLUNK_MEMORY_TOTAL_MIB + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: "1Mi" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_NODE_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.hostIP + - name: K8S_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + - name: K8S_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: SPLUNK_OBSERVABILITY_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: default-splunk-otel-collector + key: splunk_observability_access_token + + readinessProbe: + httpGet: + path: / + port: 13133 + livenessProbe: + httpGet: + path: / + port: 13133 + resources: + limits: + cpu: 200m + memory: 500Mi + volumeMounts: + - mountPath: /conf + name: otel-configmap + - mountPath: /hostfs/dev + name: host-dev + readOnly: true + - mountPath: /hostfs/etc + name: host-etc + readOnly: true + - mountPath: /hostfs/proc + name: host-proc + readOnly: true + - mountPath: /hostfs/run/udev/data + name: host-run-udev-data + readOnly: true + - mountPath: /hostfs/sys + name: host-sys + readOnly: true + - mountPath: /hostfs/var/run/utmp + name: host-var-run-utmp + readOnly: true + - mountPath: /hostfs/usr/lib/os-release + name: host-usr-lib-osrelease + readOnly: true + - mountPath: /usr/lib/splunk-otel-collector/agent-bundle/run/collectd + name: run-collectd + readOnly: false + terminationGracePeriodSeconds: 600 + volumes: + - name: run-collectd + emptyDir: + sizeLimit: 25Mi + - name: host-dev + hostPath: + path: /dev + - name: host-etc + hostPath: + path: /etc + - name: host-proc + hostPath: + path: /proc + - name: host-run-udev-data + hostPath: + path: /run/udev/data + - name: host-sys + hostPath: + path: /sys + - name: host-var-run-utmp + hostPath: + path: /var/run/utmp + - name: host-usr-lib-osrelease + hostPath: + path: /usr/lib/os-release + - name: otel-configmap + configMap: + name: default-splunk-otel-collector-otel-agent + items: + - key: relay + path: relay.yaml diff --git 
a/examples/eks-batch-nodes/rendered_manifests/deployment-cluster-receiver.yaml b/examples/eks-batch-nodes/rendered_manifests/deployment-cluster-receiver.yaml new file mode 100644 index 0000000000..2f195d086e --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/deployment-cluster-receiver.yaml @@ -0,0 +1,104 @@ +--- +# Source: splunk-otel-collector/templates/deployment-cluster-receiver.yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: default-splunk-otel-collector-k8s-cluster-receiver + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + component: otel-k8s-cluster-receiver + chart: splunk-otel-collector-0.150.0 + release: default + app.kubernetes.io/component: otel-k8s-cluster-receiver +spec: + replicas: 1 + selector: + matchLabels: + app: splunk-otel-collector + component: otel-k8s-cluster-receiver + release: default + template: + metadata: + labels: + app: splunk-otel-collector + component: otel-k8s-cluster-receiver + release: default + annotations: + checksum/config: 8c8e49b875907666d7dcf878f9318ff8c89b4d75c453a227f724876c0604b1b7 + spec: + serviceAccountName: default-splunk-otel-collector + nodeSelector: + kubernetes.io/os: linux + containers: + - name: otel-collector + args: + - --config=/conf/relay.yaml + image: quay.io/signalfx/splunk-otel-collector:0.150.0 + imagePullPolicy: IfNotPresent + env: + - name: SPLUNK_MEMORY_TOTAL_MIB + valueFrom: + resourceFieldRef: + resource: limits.memory + divisor: "1Mi" + - name: K8S_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: K8S_POD_IP + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: status.podIP + - name: K8S_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: K8S_POD_UID + valueFrom: + fieldRef: + fieldPath: metadata.uid + - name: K8S_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: SPLUNK_OBSERVABILITY_ACCESS_TOKEN + valueFrom: + secretKeyRef: + name: default-splunk-otel-collector + key: splunk_observability_access_token + readinessProbe: + httpGet: + path: / + port: 13134 + livenessProbe: + httpGet: + path: / + port: 13134 + resources: + limits: + cpu: 200m + memory: 500Mi + volumeMounts: + - mountPath: /conf + name: collector-configmap + - mountPath: /usr/lib/splunk-otel-collector/agent-bundle/run/collectd + name: run-collectd + readOnly: false + terminationGracePeriodSeconds: 600 + volumes: + - name: collector-configmap + configMap: + name: default-splunk-otel-collector-otel-k8s-cluster-receiver + items: + - key: relay + path: relay.yaml + - name: run-collectd + emptyDir: + sizeLimit: 25Mi diff --git a/examples/eks-batch-nodes/rendered_manifests/secret-splunk.yaml b/examples/eks-batch-nodes/rendered_manifests/secret-splunk.yaml new file mode 100644 index 0000000000..64068b711c --- /dev/null +++ b/examples/eks-batch-nodes/rendered_manifests/secret-splunk.yaml @@ -0,0 +1,19 @@ +--- +# Source: splunk-otel-collector/templates/secret-splunk.yaml +apiVersion: v1 +kind: Secret +metadata: + name: default-splunk-otel-collector + namespace: default + labels: + app.kubernetes.io/name: splunk-otel-collector + helm.sh/chart: splunk-otel-collector-0.150.0 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: default + app.kubernetes.io/version: "0.150.0" + app: splunk-otel-collector + chart: splunk-otel-collector-0.150.0 + 
+    release: default
+type: Opaque
+data:
+  splunk_observability_access_token: Q0hBTkdFTUU=
diff --git a/examples/eks-batch-nodes/rendered_manifests/service-agent.yaml b/examples/eks-batch-nodes/rendered_manifests/service-agent.yaml
new file mode 100644
index 0000000000..5ae19ef9ca
--- /dev/null
+++ b/examples/eks-batch-nodes/rendered_manifests/service-agent.yaml
@@ -0,0 +1,46 @@
+---
+# Source: splunk-otel-collector/templates/service-agent.yaml
+apiVersion: v1
+kind: Service
+metadata:
+  name: default-splunk-otel-collector-agent
+  namespace: default
+  labels:
+    app.kubernetes.io/name: splunk-otel-collector
+    helm.sh/chart: splunk-otel-collector-0.150.0
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/instance: default
+    app.kubernetes.io/version: "0.150.0"
+    app: splunk-otel-collector
+    component: otel-collector-agent
+    chart: splunk-otel-collector-0.150.0
+    release: default
+    app.kubernetes.io/component: otel-collector-agent
+spec:
+  type: ClusterIP
+  ports:
+  - name: jaeger-grpc
+    port: 14250
+    targetPort: jaeger-grpc
+    protocol: TCP
+  - name: jaeger-thrift
+    port: 14268
+    targetPort: jaeger-thrift
+    protocol: TCP
+  - name: otlp
+    port: 4317
+    targetPort: otlp
+    protocol: TCP
+  - name: otlp-http
+    port: 4318
+    targetPort: otlp-http
+    protocol: TCP
+  - name: zipkin
+    port: 9411
+    targetPort: zipkin
+    protocol: TCP
+  selector:
+    app: splunk-otel-collector
+    component: otel-collector-agent
+    release: default
+  internalTrafficPolicy: Local
diff --git a/examples/eks-batch-nodes/rendered_manifests/serviceAccount.yaml b/examples/eks-batch-nodes/rendered_manifests/serviceAccount.yaml
new file mode 100644
index 0000000000..111db05b6c
--- /dev/null
+++ b/examples/eks-batch-nodes/rendered_manifests/serviceAccount.yaml
@@ -0,0 +1,17 @@
+---
+# Source: splunk-otel-collector/templates/serviceAccount.yaml
+apiVersion: v1
+kind: ServiceAccount
+automountServiceAccountToken: true
+metadata:
+  name: default-splunk-otel-collector
+  namespace: default
+  labels:
+    app.kubernetes.io/name: splunk-otel-collector
+    helm.sh/chart: splunk-otel-collector-0.150.0
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/instance: default
+    app.kubernetes.io/version: "0.150.0"
+    app: splunk-otel-collector
+    chart: splunk-otel-collector-0.150.0
+    release: default