Skip to content

Commit 17eb1c3

Browse files
committed
Add detailed failure attributes to exporter send_failed metrics
Signed-off-by: Israel Blancas <iblancasa@gmail.com> fix Signed-off-by: Israel Blancas <iblancasa@gmail.com>
1 parent fd17e51 commit 17eb1c3

File tree

23 files changed

+301
-34
lines changed

23 files changed

+301
-34
lines changed

.chloggen/13956.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. receiver/otlp)
7+
component: all
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Add detailed failure attributes to exporter send_failed metrics at detailed telemetry level.
11+
12+
# One or more tracking issues or pull requests related to the change
13+
issues: [13956]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext: |-
19+
The `otelcol_exporter_send_failed_{spans,metric_points,log_records}` metrics now include detailed
20+
failure attributes when telemetry level is Detailed or higher: `failure.reason` (categorizes
21+
failure reason) and `failure.permanent` (indicates if error is permanent).
22+
This enables better alerting and debugging by distinguishing between retries exhausted, permanent
23+
errors, shutdown, context cancellation, context timeout, and transient errors.
24+
25+
# Optional: The change log or logs in which this entry should be included.
26+
# e.g. '[user]' or '[user, api]'
27+
# Include 'user' if the change is relevant to end users.
28+
# Include 'api' if there is a change to a library API.
29+
# Default: '[user]'
30+
change_logs: [user]

exporter/exporterhelper/documentation.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,23 +64,23 @@ Current size of the retry queue (in batches). [Alpha]
6464

6565
### otelcol_exporter_send_failed_log_records
6666

67-
Number of log records in failed attempts to send to destination. [Alpha]
67+
Number of log records in failed attempts to send to destination. At detailed telemetry level, includes attributes: failure.reason, failure.permanent. [Alpha]
6868

6969
| Unit | Metric Type | Value Type | Monotonic | Stability |
7070
| ---- | ----------- | ---------- | --------- | --------- |
7171
| {records} | Sum | Int | true | Alpha |
7272

7373
### otelcol_exporter_send_failed_metric_points
7474

75-
Number of metric points in failed attempts to send to destination. [Alpha]
75+
Number of metric points in failed attempts to send to destination. At detailed telemetry level, includes attributes: failure.reason, failure.permanent. [Alpha]
7676

7777
| Unit | Metric Type | Value Type | Monotonic | Stability |
7878
| ---- | ----------- | ---------- | --------- | --------- |
7979
| {datapoints} | Sum | Int | true | Alpha |
8080

8181
### otelcol_exporter_send_failed_spans
8282

83-
Number of spans in failed attempts to send to destination. [Alpha]
83+
Number of spans in failed attempts to send to destination. At detailed telemetry level, includes attributes: failure.reason, failure.permanent. [Alpha]
8484

8585
| Unit | Metric Type | Value Type | Monotonic | Stability |
8686
| ---- | ----------- | ---------- | --------- | --------- |

exporter/exporterhelper/go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ require (
1010
go.opentelemetry.io/collector/component/componenttest v0.141.0
1111
go.opentelemetry.io/collector/config/configoptional v1.47.0
1212
go.opentelemetry.io/collector/config/configretry v1.47.0
13+
go.opentelemetry.io/collector/config/configtelemetry v0.141.0
1314
go.opentelemetry.io/collector/confmap v1.47.0
1415
go.opentelemetry.io/collector/confmap/xconfmap v0.141.0
1516
go.opentelemetry.io/collector/consumer v1.47.0
@@ -89,6 +90,8 @@ replace go.opentelemetry.io/collector/receiver => ../../receiver
8990

9091
replace go.opentelemetry.io/collector/config/configretry => ../../config/configretry
9192

93+
replace go.opentelemetry.io/collector/config/configtelemetry => ../../config/configtelemetry
94+
9295
replace go.opentelemetry.io/collector/consumer/xconsumer => ../../consumer/xconsumer
9396

9497
replace go.opentelemetry.io/collector/consumer/consumertest => ../../consumer/consumertest

exporter/exporterhelper/internal/metadata/generated_telemetry.go

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/exporterhelper/internal/metadatatest/generated_telemetrytest.go

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exporter/exporterhelper/internal/obs_report_sender.go

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,19 @@ package internal // import "go.opentelemetry.io/collector/exporter/exporterhelpe
55

66
import (
77
"context"
8+
"errors"
9+
"strings"
810

911
"go.opentelemetry.io/otel/attribute"
10-
"go.opentelemetry.io/otel/codes"
12+
otelcodes "go.opentelemetry.io/otel/codes"
1113
"go.opentelemetry.io/otel/metric"
1214
"go.opentelemetry.io/otel/trace"
1315

1416
"go.opentelemetry.io/collector/component"
17+
"go.opentelemetry.io/collector/config/configtelemetry"
18+
"go.opentelemetry.io/collector/consumer/consumererror"
1519
"go.opentelemetry.io/collector/exporter"
20+
"go.opentelemetry.io/collector/exporter/exporterhelper/internal/experr"
1621
"go.opentelemetry.io/collector/exporter/exporterhelper/internal/metadata"
1722
"go.opentelemetry.io/collector/exporter/exporterhelper/internal/queuebatch"
1823
"go.opentelemetry.io/collector/exporter/exporterhelper/internal/request"
@@ -36,6 +41,15 @@ const (
3641
ItemsFailed = "items.failed"
3742
)
3843

44+
// FailureAttributeKey represents attribute keys for detailed failure information.
// These attributes are only emitted at detailed telemetry level.
type FailureAttributeKey string

const (
	// FailureReasonKey is the attribute key whose value categorizes why a
	// send failed (for example "retries_exhausted" or "transient_error").
	FailureReasonKey FailureAttributeKey = "failure.reason"
	// FailurePermanentKey is the attribute key whose boolean value indicates
	// whether the error that caused the failure is permanent.
	FailurePermanentKey FailureAttributeKey = "failure.permanent"
)
)
52+
3953
type obsReportSender[K request.Request] struct {
4054
component.StartFunc
4155
component.ShutdownFunc
@@ -46,6 +60,8 @@ type obsReportSender[K request.Request] struct {
4660
metricAttr metric.MeasurementOption
4761
itemsSentInst metric.Int64Counter
4862
itemsFailedInst metric.Int64Counter
63+
telemetryLevel configtelemetry.Level
64+
exporterID string
4965
next sender.Sender[K]
5066
}
5167

@@ -57,13 +73,16 @@ func newObsReportSender[K request.Request](set exporter.Settings, signal pipelin
5773

5874
idStr := set.ID.String()
5975
expAttr := attribute.String(ExporterKey, idStr)
76+
telemetryLevel := configtelemetry.LevelDetailed
6077

6178
or := &obsReportSender[K]{
62-
spanName: ExporterKey + spanNameSep + idStr + spanNameSep + signal.String(),
63-
tracer: metadata.Tracer(set.TelemetrySettings),
64-
spanAttrs: trace.WithAttributes(expAttr, attribute.String(DataTypeKey, signal.String())),
65-
metricAttr: metric.WithAttributeSet(attribute.NewSet(expAttr)),
66-
next: next,
79+
spanName: ExporterKey + spanNameSep + idStr + spanNameSep + signal.String(),
80+
tracer: metadata.Tracer(set.TelemetrySettings),
81+
spanAttrs: trace.WithAttributes(expAttr, attribute.String(DataTypeKey, signal.String())),
82+
metricAttr: metric.WithAttributeSet(attribute.NewSet(expAttr)),
83+
telemetryLevel: telemetryLevel,
84+
exporterID: idStr,
85+
next: next,
6786
}
6887

6988
switch signal {
@@ -112,9 +131,15 @@ func (ors *obsReportSender[K]) endOp(ctx context.Context, numLogRecords int, err
112131
if ors.itemsSentInst != nil {
113132
ors.itemsSentInst.Add(ctx, numSent, ors.metricAttr)
114133
}
115-
// No metrics recorded for profiles.
116-
if ors.itemsFailedInst != nil {
117-
ors.itemsFailedInst.Add(ctx, numFailedToSend, ors.metricAttr)
134+
if ors.itemsFailedInst != nil && numFailedToSend > 0 {
135+
if ors.telemetryLevel >= configtelemetry.LevelDetailed {
136+
failedAttrs := extractFailureAttributes(err)
137+
baseAttrs := attribute.NewSet(attribute.String(ExporterKey, ors.exporterID))
138+
combinedAttrs := attribute.NewSet(append(baseAttrs.ToSlice(), failedAttrs.ToSlice()...)...)
139+
ors.itemsFailedInst.Add(ctx, numFailedToSend, metric.WithAttributeSet(combinedAttrs))
140+
} else {
141+
ors.itemsFailedInst.Add(ctx, numFailedToSend, ors.metricAttr)
142+
}
118143
}
119144

120145
span := trace.SpanFromContext(ctx)
@@ -126,7 +151,7 @@ func (ors *obsReportSender[K]) endOp(ctx context.Context, numLogRecords int, err
126151
attribute.Int64(ItemsFailed, numFailedToSend),
127152
)
128153
if err != nil {
129-
span.SetStatus(codes.Error, err.Error())
154+
span.SetStatus(otelcodes.Error, err.Error())
130155
}
131156
}
132157
}
@@ -137,3 +162,38 @@ func toNumItems(numExportedItems int, err error) (int64, int64) {
137162
}
138163
return int64(numExportedItems), 0
139164
}
165+
166+
func extractFailureAttributes(err error) attribute.Set {
167+
if err == nil {
168+
return attribute.NewSet()
169+
}
170+
171+
attrs := []attribute.KeyValue{}
172+
reason := determineFailureReason(err)
173+
attrs = append(attrs, attribute.String(string(FailureReasonKey), reason))
174+
isPermanent := consumererror.IsPermanent(err)
175+
attrs = append(attrs, attribute.Bool(string(FailurePermanentKey), isPermanent))
176+
return attribute.NewSet(attrs...)
177+
}
178+
179+
func determineFailureReason(err error) string {
180+
if err == nil {
181+
return ""
182+
}
183+
if strings.Contains(err.Error(), "no more retries left") {
184+
return "retries_exhausted"
185+
}
186+
if experr.IsShutdownErr(err) {
187+
return "shutdown"
188+
}
189+
if consumererror.IsPermanent(err) {
190+
return "permanent_error"
191+
}
192+
if errors.Is(err, context.Canceled) {
193+
return "context_cancelled"
194+
}
195+
if errors.Is(err, context.DeadlineExceeded) {
196+
return "context_timeout"
197+
}
198+
return "transient_error"
199+
}

0 commit comments

Comments
 (0)