Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions devdocs/config/CONFIG.md
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,15 @@ InternalMetricsConfig options for the different metrics exporters
| `internal_metrics.bpf_metric_scrape_interval` | `duration` | `OTEL_EBPF_BPF_METRIC_SCRAPE_INTERVAL` | `15s` | `30s`, `5m`, `1ms`, etc | | |
| `internal_metrics.exporter` | `string` | `OTEL_EBPF_INTERNAL_METRICS_EXPORTER` | `disabled` | `disabled`, `otel`, `prometheus` | | |

### `internal_metrics.avoided_services`

AvoidedServicesConfig controls the avoided-services internal metric.

| YAML Path | Type | Env Var | Default | Values | Deprecated | Description |
|---|---|---|---|---|---|---|
| `internal_metrics.avoided_services.disabled` | `boolean` | `OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_DISABLED` | `false` | | | Disables the avoided-services internal metric. |
| `internal_metrics.avoided_services.limit` | `integer` | `OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_LIMIT` | `2000` | | | Bounds the number of avoided-services metric series, including the overflow series. 0 uses the OpenTelemetry default metric cardinality limit. |

### `internal_metrics.prometheus`

TODO: TLS
Expand Down
21 changes: 21 additions & 0 deletions devdocs/config/config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,24 @@
"type": "object",
"description": "AttributesConfig stores the user-provided section for filtering either Application or Network records by attribute values"
},
"AvoidedServicesConfig": {
"properties": {
"disabled": {
"type": "boolean",
"description": "Disables the avoided-services internal metric.",
"default": false,
"x-env-var": "OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_DISABLED"
},
"limit": {
"type": "integer",
"description": "Bounds the number of avoided-services metric series, including the overflow series. 0 uses the OpenTelemetry default metric cardinality limit.",
"default": 2000,
"x-env-var": "OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_LIMIT"
}
},
"type": "object",
"description": "AvoidedServicesConfig controls the avoided-services internal metric."
},
"BedrockConfig": {
"properties": {
"enabled": {
Expand Down Expand Up @@ -1370,6 +1388,9 @@
},
"InternalMetricsConfig": {
"properties": {
"avoided_services": {
"$ref": "#/$defs/AvoidedServicesConfig"
},
"bpf_metric_scrape_interval": {
"type": "string",
"pattern": "^[0-9]+(ms|s|m)$",
Expand Down
19 changes: 12 additions & 7 deletions devdocs/exclude-otel-instrumented-services.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,12 +98,17 @@ Once a service has its `ExportsOTelMetrics` / `ExportsOTelTraces` /
- Prometheus — RED-metrics and span-metrics filters in [`pkg/export/prom/prom.go`](../pkg/export/prom/prom.go).

Each detection event increments the `obi.avoided.services` internal metric
(Prometheus name: `obi_avoided_services`), labeled with the service identity
and the telemetry type that was avoided (`metrics` or `traces`). Span-metrics
suppression is reported under the `metrics` label, not a separate one — the
`metrics_span` detection path in `reportAvoidedService` routes through
`AvoidInstrumentationMetrics`, which emits `metrics`. It's emitted from
`reportAvoidedService` in
(Prometheus name: `obi_avoided_services`). Normal series are labeled with the
logical service name, service namespace, and the telemetry type that was
avoided (`metrics` or `traces`). The service instance ID is intentionally not
reported because it is unique per service instance and would churn backend
time series. When the configured cardinality limit is reached, additional
detections are collapsed before export and reported through the OpenTelemetry
overflow attribute `otel.metric.overflow=true` (Prometheus label:
`otel_metric_overflow="true"`). Span-metrics suppression is reported under the
`metrics` label, not a separate one — the `metrics_span` detection path in
`reportAvoidedService` routes through `AvoidInstrumentationMetrics`, which
emits `metrics`. The metric is emitted from `reportAvoidedService` in
[`pkg/ebpf/common/pids.go`](../pkg/ebpf/common/pids.go) and is the
authoritative signal for whether detection has fired for a given service.

Expand Down Expand Up @@ -167,4 +172,4 @@ detection sees only "this service exports OTLP", it cannot tell the SDK's
output apart from its own — so it disables its entire auto-instrumentation
suite for that service. The user ends up with custom telemetry but no HTTP /
gRPC / SQL / Redis / … telemetry, even though the SDK never produced any of
those.
those.
3 changes: 0 additions & 3 deletions internal/test/integration/internal_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,6 @@ func checkAvoidedServicesMetrics(t *testing.T) {
assert.Condition(ct, func() bool {
return labelMap["telemetry_type"] == "metrics" || labelMap["telemetry_type"] == "traces"
}, "telemetry_type label should be either 'metrics' or 'traces'")
// service_instance_id can be empty, but should be present
_, ok = labelMap["service_instance_id"]
assert.True(ct, ok, "service_instance_id label should be present")

if metric.Gauge != nil {
assert.Greater(ct, metric.Gauge.GetValue(), float64(0), "Expected avoided service metric value to be > 0")
Expand Down
13 changes: 13 additions & 0 deletions pkg/export/imetrics/avoided_services.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package imetrics // import "go.opentelemetry.io/obi/pkg/export/imetrics"

// NOTE(review): the doc comments on this type and its fields appear verbatim as
// descriptions in devdocs/config/CONFIG.md and devdocs/config/config-schema.json,
// so those files are presumably generated from them — confirm, and regenerate the
// docs whenever these comments change.
// NOTE(review): Limit carries `validate:"gte=0"`, so negative values are presumably
// rejected during config validation; the documented default is 2000 — confirm
// where that default is applied.

// AvoidedServicesConfig controls the avoided-services internal metric.
type AvoidedServicesConfig struct {
// Disabled disables the avoided-services internal metric.
Disabled bool `yaml:"disabled" env:"OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_DISABLED"`
// Limit bounds the number of avoided-services metric series, including the overflow series.
// 0 uses the OpenTelemetry default metric cardinality limit.
Limit int `yaml:"limit" env:"OTEL_EBPF_INTERNAL_METRICS_AVOIDED_SERVICES_LIMIT" validate:"gte=0"`
}
1 change: 1 addition & 0 deletions pkg/export/imetrics/imetrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ const (
// NOTE(review): devdocs/config/CONFIG.md and config-schema.json carry descriptions
// that mirror the doc comments in this package, so they are presumably generated —
// regenerate them after touching the comments below.

// InternalMetricsConfig options for the different metrics exporters
type InternalMetricsConfig struct {
// Prometheus holds the settings read from the `prometheus` YAML key.
Prometheus PrometheusConfig `yaml:"prometheus,omitempty"`
// AvoidedServices holds the settings of the avoided-services internal metric,
// read from the `avoided_services` YAML key.
AvoidedServices AvoidedServicesConfig `yaml:"avoided_services,omitempty"`
// Exporter selects the internal metrics exporter. When set, validation accepts
// only `disabled`, `prometheus` or `otel`; the documented default is `disabled`.
Exporter InternalMetricsExporter `yaml:"exporter,omitempty" env:"OTEL_EBPF_INTERNAL_METRICS_EXPORTER" validate:"omitempty,oneof=disabled prometheus otel"`
// BpfMetricScrapeInterval must be greater than zero when set; the documented
// default is 15s. Presumably the period between scrapes of the BPF metrics —
// TODO confirm against the code that consumes it.
BpfMetricScrapeInterval time.Duration `yaml:"bpf_metric_scrape_interval" env:"OTEL_EBPF_BPF_METRIC_SCRAPE_INTERVAL" validate:"omitempty,gt=0"`
}
Expand Down
77 changes: 77 additions & 0 deletions pkg/export/imetrics/imetrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ import (
dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

attr "go.opentelemetry.io/obi/pkg/export/attributes/names"
"go.opentelemetry.io/obi/pkg/internal/avoidedsvc"
)

func TestIsBuiltinNoopReporter(t *testing.T) {
Expand Down Expand Up @@ -61,3 +64,77 @@ type noopEmbeddingReporter struct {
}

// BpfProbeStats ignores all of its arguments and does nothing.
// NOTE(review): presumably declared so that noopEmbeddingReporter overrides one
// method of the reporter it embeds — confirm against the struct definition.
func (n *noopEmbeddingReporter) BpfProbeStats(_, _, _ string, _ uint64, _ float64) {}

// TestPrometheusReporterAvoidedServicesBounded records four distinct
// avoided-service label sets against a limit of 3 and expects exactly three
// exported series: the two svc-0 series plus a single overflow series whose
// service and telemetry labels are empty.
func TestPrometheusReporterAvoidedServicesBounded(t *testing.T) {
	reg := prometheus.NewRegistry()
	cfg := &InternalMetricsConfig{
		AvoidedServices: AvoidedServicesConfig{Limit: 3},
	}
	reporter := NewPrometheusReporter(cfg, nil, reg)

	// Two services, each avoided for both telemetry types.
	reporter.AvoidInstrumentationMetrics("svc-0", "ns-0", "inst-0")
	reporter.AvoidInstrumentationTraces("svc-0", "ns-0", "inst-0")
	reporter.AvoidInstrumentationMetrics("svc-1", "ns-1", "inst-1")
	reporter.AvoidInstrumentationTraces("svc-1", "ns-1", "inst-1")

	series := gatherAvoidedServices(t, reg)
	require.Len(t, series, 3)

	var overflowCount int
	seen := make(map[string]struct{})
	for _, sample := range series {
		lbls := metricLabels(sample)
		overflow := lbls[avoidedsvc.PrometheusOverflowLabel]

		if overflow != "true" {
			// Regular series: must be explicitly flagged as non-overflow.
			assert.Equal(t, "false", overflow)
			key := lbls["service_name"] + "/" +
				lbls["service_namespace"] + "/" +
				lbls["telemetry_type"]
			seen[key] = struct{}{}
			continue
		}

		// Overflow series: identifying labels must be blank.
		overflowCount++
		assert.Empty(t, lbls["service_name"])
		assert.Empty(t, lbls["service_namespace"])
		assert.Empty(t, lbls["telemetry_type"])
	}

	assert.Contains(t, seen, "svc-0/ns-0/metrics")
	assert.Contains(t, seen, "svc-0/ns-0/traces")
	assert.Equal(t, 1, overflowCount)
}

// TestPrometheusReporterAvoidedServicesDisabled verifies that, with the
// avoided-services metric disabled, recording an avoided service leaves no
// avoided-services metric family in the registry.
func TestPrometheusReporterAvoidedServicesDisabled(t *testing.T) {
	reg := prometheus.NewRegistry()
	cfg := &InternalMetricsConfig{
		AvoidedServices: AvoidedServicesConfig{Disabled: true},
	}
	reporter := NewPrometheusReporter(cfg, nil, reg)

	reporter.AvoidInstrumentationMetrics("svc-0", "ns-0", "inst-0")

	families, err := reg.Gather()
	require.NoError(t, err)

	unwanted := attr.VendorPrefix + "_avoided_services"
	for i := range families {
		assert.NotEqual(t, unwanted, families[i].GetName())
	}
}

// gatherAvoidedServices gathers the registry and returns the samples of the
// avoided-services metric family. The test fails immediately if gathering
// errors or if that family is not present.
func gatherAvoidedServices(t *testing.T, registry *prometheus.Registry) []*dto.Metric {
	t.Helper()

	families, err := registry.Gather()
	require.NoError(t, err)

	wanted := attr.VendorPrefix + "_avoided_services"
	for i := range families {
		if families[i].GetName() != wanted {
			continue
		}
		return families[i].GetMetric()
	}

	require.Fail(t, "missing avoided services metric")
	return nil
}

// metricLabels flattens the label pairs of a metric sample into a map keyed by
// label name. The returned map is never nil.
func metricLabels(metric *dto.Metric) map[string]string {
	pairs := metric.GetLabel()
	byName := make(map[string]string, len(pairs))
	for i := range pairs {
		byName[pairs[i].GetName()] = pairs[i].GetValue()
	}
	return byName
}
29 changes: 23 additions & 6 deletions pkg/export/imetrics/iprom.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"go.opentelemetry.io/obi/pkg/buildinfo"
attr "go.opentelemetry.io/obi/pkg/export/attributes/names"
"go.opentelemetry.io/obi/pkg/export/connector"
"go.opentelemetry.io/obi/pkg/internal/avoidedsvc"
)

// pipelineBufferLengths buckets for histogram metrics about the number of traces submitted from one stage to another
Expand All @@ -37,6 +38,7 @@ type PrometheusReporter struct {
instrumentedProcesses *prometheus.GaugeVec
instrumentationErrors *prometheus.CounterVec
avoidedServices *prometheus.GaugeVec
avoidedServicesLimiter *avoidedsvc.Limiter
buildInfo prometheus.Gauge
bpfProbeExecutions *prometheus.CounterVec
bpfProbeLatencySum *prometheus.CounterVec
Expand Down Expand Up @@ -93,10 +95,6 @@ func NewPrometheusReporter(cfg *InternalMetricsConfig, manager *connector.Promet
Name: attr.VendorPrefix + "_instrumentation_errors_total",
Help: "Total number of instrumentation errors by process name and error type",
}, []string{"process_name", "error_type"}),
avoidedServices: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: attr.VendorPrefix + "_avoided_services",
Help: "Services avoided due to existing OpenTelemetry instrumentation",
}, []string{"service_name", "service_namespace", "service_instance_id", "telemetry_type"}),
buildInfo: prometheus.NewGauge(prometheus.GaugeOpts{
Name: attr.VendorPrefix + "_internal_build_info",
Help: "A metric with a constant '1' value labeled by version, revision, branch, " +
Expand Down Expand Up @@ -148,6 +146,18 @@ func NewPrometheusReporter(cfg *InternalMetricsConfig, manager *connector.Promet
Help: "Ratio [0-1] between the unread messages of an internal Go channel and its total capacity",
}, []string{"subscriber"}),
}
if !cfg.AvoidedServices.Disabled {
pr.avoidedServicesLimiter = avoidedsvc.NewLimiter(cfg.AvoidedServices.Limit)
pr.avoidedServices = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: attr.VendorPrefix + "_avoided_services",
Help: "Services avoided due to existing OpenTelemetry instrumentation",
}, []string{
"service_name",
"service_namespace",
"telemetry_type",
avoidedsvc.PrometheusOverflowLabel,
})
}
metrics := []prometheus.Collector{
pr.tracerFlushes,
pr.otelMetricExports,
Expand All @@ -157,7 +167,6 @@ func NewPrometheusReporter(cfg *InternalMetricsConfig, manager *connector.Promet
pr.prometheusRequests,
pr.instrumentedProcesses,
pr.instrumentationErrors,
pr.avoidedServices,
pr.buildInfo,
pr.bpfProbeExecutions,
pr.bpfProbeLatencySum,
Expand All @@ -168,6 +177,9 @@ func NewPrometheusReporter(cfg *InternalMetricsConfig, manager *connector.Promet
pr.bpfIgnoredPacketCount,
pr.queueCapacityRatio,
}
if pr.avoidedServices != nil {
metrics = append(metrics, pr.avoidedServices)
}
if registry != nil {
registry.MustRegister(metrics...)
} else {
Expand Down Expand Up @@ -221,7 +233,12 @@ func (p *PrometheusReporter) InstrumentationError(processName string, errorType
}

// recordAvoidedService sets the avoided-services gauge to 1 for the series
// selected by the cardinality limiter.
//
// It is a no-op when the gauge was never created; NewPrometheusReporter only
// creates the gauge and its limiter when the metric is not disabled in the
// configuration. All four arguments are handed to the limiter, and the gauge
// is addressed solely through the label values the limiter returns, so the
// caller-supplied values are never used as label values directly.
func (p *PrometheusReporter) recordAvoidedService(serviceName, serviceNamespace, serviceInstanceID, telemetryType string) {
	// The guard must come before any use of the gauge: touching a nil GaugeVec
	// would panic when the metric is disabled.
	if p.avoidedServices == nil {
		return
	}

	labels := p.avoidedServicesLimiter.Labels(serviceName, serviceNamespace, serviceInstanceID, telemetryType)
	p.avoidedServices.WithLabelValues(labels.PrometheusValues()...).Set(1)
}

func (p *PrometheusReporter) AvoidInstrumentationMetrics(serviceName, serviceNamespace, serviceInstanceID string) {
Expand Down
41 changes: 30 additions & 11 deletions pkg/export/otel/metrics_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
attr "go.opentelemetry.io/obi/pkg/export/attributes/names"
"go.opentelemetry.io/obi/pkg/export/imetrics"
"go.opentelemetry.io/obi/pkg/export/otel/otelcfg"
"go.opentelemetry.io/obi/pkg/internal/avoidedsvc"
"go.opentelemetry.io/obi/pkg/pipe/global"
)

Expand All @@ -36,6 +37,7 @@ type InternalMetricsReporter struct {
instrumentedProcesses instrument.Int64UpDownCounter
instrumentationErrors instrument.Int64Counter
avoidedServices instrument.Int64Gauge
avoidedServicesLimiter *avoidedsvc.Limiter
buildInfo instrument.Int64Gauge
bpfProbeExecutions instrument.Int64Counter
bpfProbeLatencySum instrument.Float64Counter
Expand Down Expand Up @@ -125,12 +127,17 @@ func NewInternalMetricsReporter(ctx context.Context, ctxInfo *global.ContextInfo
return nil, err
}

avoidedServices, err := meter.Int64Gauge(
attr.VendorPrefix+".avoided.services",
instrument.WithDescription("Services avoided due to existing OpenTelemetry instrumentation"),
)
if err != nil {
return nil, err
var avoidedServices instrument.Int64Gauge
var avoidedServicesLimiter *avoidedsvc.Limiter
if !internalMetrics.AvoidedServices.Disabled {
avoidedServices, err = meter.Int64Gauge(
attr.VendorPrefix+".avoided.services",
instrument.WithDescription("Services avoided due to existing OpenTelemetry instrumentation"),
)
if err != nil {
return nil, err
}
avoidedServicesLimiter = avoidedsvc.NewLimiter(internalMetrics.AvoidedServices.Limit)
}

buildInfo, err := meter.Int64Gauge(
Expand Down Expand Up @@ -220,6 +227,7 @@ func NewInternalMetricsReporter(ctx context.Context, ctxInfo *global.ContextInfo
instrumentedProcesses: instrumentedProcesses,
instrumentationErrors: instrumentationErrors,
avoidedServices: avoidedServices,
avoidedServicesLimiter: avoidedServicesLimiter,
buildInfo: buildInfo,
bpfProbeExecutions: bpfProbeExecutions,
bpfProbeLatencySum: bpfProbeLatencySum,
Expand Down Expand Up @@ -302,11 +310,22 @@ func newResourceInternal(nodeMeta *meta.NodeMeta) *resource.Resource {
}

func (p *InternalMetricsReporter) recordAvoidedService(serviceName, serviceNamespace, serviceInstanceID, telemetryType string) {
attrs := []attribute.KeyValue{
semconv.ServiceName(serviceName),
semconv.ServiceNamespace(serviceNamespace),
semconv.ServiceInstanceID(serviceInstanceID),
attribute.String("telemetry.type", telemetryType),
if p.avoidedServices == nil {
return
}

labels := p.avoidedServicesLimiter.Labels(serviceName, serviceNamespace, serviceInstanceID, telemetryType)
var attrs []attribute.KeyValue
if labels.Overflow {
attrs = []attribute.KeyValue{
attribute.Bool(avoidedsvc.OverflowAttribute, true),
}
} else {
attrs = []attribute.KeyValue{
semconv.ServiceName(labels.ServiceName),
semconv.ServiceNamespace(labels.ServiceNamespace),
attribute.String("telemetry.type", labels.TelemetryType),
}
}

p.avoidedServices.Record(p.ctx, 1, instrument.WithAttributes(attrs...))
Expand Down
Loading
Loading