27 changes: 27 additions & 0 deletions .chloggen/fix_retryInterval.yaml
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
component: connector/failover

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Attempt to retry higher-priority pipelines immediately when the lowest-priority pipeline also fails

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [46820]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
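
For orientation, here is a hedged sketch of the two connector settings this change revolves around, as exercised by the test configs later in this PR. The field names mirror the struct literals in those tests; the mapstructure tags are assumptions drawn from the component's documented YAML keys, not from this diff.

```go
package failoverconnector

import (
	"time"

	"go.opentelemetry.io/collector/pipeline"
)

// Config sketch: only the fields the new tests touch are shown.
type Config struct {
	// PipelinePriority is an ordered list of priority levels; data is routed
	// to the pipelines in the highest-priority level that is healthy.
	PipelinePriority [][]pipeline.ID `mapstructure:"priority_levels"`

	// RetryInterval controls how often the connector re-probes higher-priority
	// pipelines that previously reported errors.
	RetryInterval time.Duration `mapstructure:"retry_interval"`
}
```

The new tests set RetryInterval to several minutes precisely so the background retry cannot fire during the test; the immediate-retry path added in this PR is what moves data back to a recovered pipeline.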
68 changes: 29 additions & 39 deletions connector/failoverconnector/failover_test.go
@@ -18,7 +18,6 @@ import (
)

func TestFailoverRecovery(t *testing.T) {
var sinkFirst, sinkSecond, sinkThird, sinkFourth consumertest.TracesSink
tracesFirst := pipeline.NewIDWithName(pipeline.SignalTraces, "traces/first")
tracesSecond := pipeline.NewIDWithName(pipeline.SignalTraces, "traces/second")
tracesThird := pipeline.NewIDWithName(pipeline.SignalTraces, "traces/third")
@@ -29,48 +28,48 @@ func TestFailoverRecovery(t *testing.T) {
RetryInterval: 50 * time.Millisecond,
}

router := connector.NewTracesRouter(map[pipeline.ID]consumer.Traces{
tracesFirst: &sinkFirst,
tracesSecond: &sinkSecond,
tracesThird: &sinkThird,
tracesFourth: &sinkFourth,
})
tr := sampleTrace()

conn, err := NewFactory().CreateTracesToTraces(t.Context(),
connectortest.NewNopSettings(metadata.Type), cfg, router.(consumer.Traces))
newTestConnector := func(t *testing.T) (*tracesFailover, *tracesRouter, *consumertest.TracesSink, *consumertest.TracesSink, *consumertest.TracesSink, *consumertest.TracesSink) {
t.Helper()

require.NoError(t, err)
var sinkFirst, sinkSecond, sinkThird, sinkFourth consumertest.TracesSink
router := connector.NewTracesRouter(map[pipeline.ID]consumer.Traces{
tracesFirst: &sinkFirst,
tracesSecond: &sinkSecond,
tracesThird: &sinkThird,
tracesFourth: &sinkFourth,
})

failoverConnector := conn.(*tracesFailover)
tRouter := failoverConnector.failover
conn, err := NewFactory().CreateTracesToTraces(t.Context(),
connectortest.NewNopSettings(metadata.Type), cfg, router.(consumer.Traces))
require.NoError(t, err)

tr := sampleTrace()
failoverConnector := conn.(*tracesFailover)
t.Cleanup(func() {
assert.NoError(t, failoverConnector.Shutdown(t.Context()))
})

defer func() {
assert.NoError(t, failoverConnector.Shutdown(t.Context()))
}()
return failoverConnector, failoverConnector.failover, &sinkFirst, &sinkSecond, &sinkThird, &sinkFourth
}

t.Run("single failover recovery to primary consumer: level 2 -> 1", func(t *testing.T) {
defer func() {
resetConsumers(tRouter, &sinkFirst, &sinkSecond, &sinkThird, &sinkFourth)
}()
failoverConnector, tRouter, sinkFirst, _, _, _ := newTestConnector(t)
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errTracesConsumer))

require.NoError(t, conn.ConsumeTraces(t.Context(), tr))
require.NoError(t, failoverConnector.ConsumeTraces(t.Context(), tr))
idx := failoverConnector.failover.TestGetCurrentConsumerIndex()
require.Equal(t, 1, idx)

failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkFirst)
failoverConnector.failover.ModifyConsumerAtIndex(0, sinkFirst)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 0, tr)
}, 3*time.Second, 5*time.Millisecond)
})

t.Run("double failover recovery: level 3 -> 2 -> 1", func(t *testing.T) {
defer func() {
resetConsumers(tRouter, &sinkFirst, &sinkSecond, &sinkThird, &sinkFourth)
}()
failoverConnector, tRouter, sinkFirst, sinkSecond, _, _ := newTestConnector(t)
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errTracesConsumer))
failoverConnector.failover.ModifyConsumerAtIndex(1, consumertest.NewErr(errTracesConsumer))

@@ -79,23 +78,21 @@ func TestFailoverRecovery(t *testing.T) {
}, 3*time.Second, 5*time.Millisecond)

// Simulate recovery of exporter
failoverConnector.failover.ModifyConsumerAtIndex(1, &sinkSecond)
failoverConnector.failover.ModifyConsumerAtIndex(1, sinkSecond)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 1, tr)
}, 3*time.Second, 5*time.Millisecond)

failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkFirst)
failoverConnector.failover.ModifyConsumerAtIndex(0, sinkFirst)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 0, tr)
}, 3*time.Second, 5*time.Millisecond)
})

t.Run("multiple failover recovery: level 3 -> 2 -> 4 -> 3 -> 1", func(t *testing.T) {
defer func() {
resetConsumers(tRouter, &sinkFirst, &sinkSecond, &sinkThird, &sinkFourth)
}()
failoverConnector, tRouter, sinkFirst, sinkSecond, sinkThird, _ := newTestConnector(t)
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errTracesConsumer))
failoverConnector.failover.ModifyConsumerAtIndex(1, consumertest.NewErr(errTracesConsumer))

@@ -104,7 +101,7 @@ func TestFailoverRecovery(t *testing.T) {
}, 3*time.Second, 5*time.Millisecond)

// Simulate recovery of exporter
failoverConnector.failover.ModifyConsumerAtIndex(1, &sinkSecond)
failoverConnector.failover.ModifyConsumerAtIndex(1, sinkSecond)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 1, tr)
@@ -117,23 +114,16 @@ func TestFailoverRecovery(t *testing.T) {
return consumeTracesAndCheckStable(tRouter, 3, tr)
}, 3*time.Second, 5*time.Millisecond)

failoverConnector.failover.ModifyConsumerAtIndex(2, &sinkThird)
failoverConnector.failover.ModifyConsumerAtIndex(2, sinkThird)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 2, tr)
}, 3*time.Second, 5*time.Millisecond)

failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkThird)
failoverConnector.failover.ModifyConsumerAtIndex(0, sinkFirst)

require.Eventually(t, func() bool {
return consumeTracesAndCheckStable(tRouter, 0, tr)
}, 3*time.Second, 5*time.Millisecond)
})
}

func resetConsumers(router *tracesRouter, consumers ...consumer.Traces) {
for i, sink := range consumers {
router.ModifyConsumerAtIndex(i, sink)
}
router.TestSetStableConsumerIndex(0)
}
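
The refactor above swaps the shared sinks and the resetConsumers helper for a per-subtest constructor that registers teardown with t.Cleanup. A minimal, generic sketch of that pattern (names here are illustrative, not taken from the connector):

```go
package example

import "testing"

type fixture struct{ closed bool }

func (f *fixture) close() { f.closed = true }

// newFixture builds fresh state for each subtest and registers teardown via
// t.Cleanup, so subtests no longer need to reset shared state by hand.
func newFixture(t *testing.T) *fixture {
	t.Helper()
	f := &fixture{}
	t.Cleanup(func() { f.close() }) // runs automatically when the subtest ends
	return f
}

func TestWithIsolatedFixtures(t *testing.T) {
	t.Run("first subtest", func(t *testing.T) {
		f := newFixture(t) // fresh state
		_ = f
	})
	t.Run("second subtest", func(t *testing.T) {
		f := newFixture(t) // unaffected by the first subtest
		_ = f
	})
}
```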
6 changes: 6 additions & 0 deletions connector/failoverconnector/logs.go
@@ -47,6 +47,12 @@ func (f *logsRouter) consumeByHealthyPipeline(ctx context.Context, ld plog.Logs)
}

if err := tc.ConsumeLogs(ctx, ld); err != nil {
if idx > 0 && idx == len(f.cfg.PipelinePriority)-1 {
Contributor:
What does the idx > 0 test do for us? I take it we want to try at least once even when all servers are unhealthy?

Contributor Author:
I just thought that if a user configures the failover connector with only a single pipeline, the idx > 0 check avoids an unnecessary fn call.

And yes, we want to try one last time even when all servers are unhealthy.

if f.sampleRetryConsumers(ctx, ld) {
return nil
}
}

f.reportConsumerError(idx)
continue
}
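
The review thread above asks what the idx > 0 guard buys. Below is a hedged, standalone sketch of the control flow this hunk adds — not the connector's actual sampleRetryConsumers implementation: when the consumer at the lowest priority level also fails, make one immediate pass over the higher-priority consumers before giving up, and skip that pass entirely when only a single level is configured.

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

// consumeWithLastLevelRetry walks the priority levels in order. If the
// consumer at the lowest priority level (the last index) also fails, it makes
// one immediate pass over the higher-priority consumers before reporting
// failure; the idx > 0 guard skips that pass when only one level exists.
func consumeWithLastLevelRetry(ctx context.Context, consumers []func(context.Context) error) error {
	for idx := 0; idx < len(consumers); idx++ {
		if err := consumers[idx](ctx); err != nil {
			if idx > 0 && idx == len(consumers)-1 {
				for retryIdx := 0; retryIdx < idx; retryIdx++ {
					if consumers[retryIdx](ctx) == nil {
						return nil // a higher-priority pipeline recovered
					}
				}
			}
			continue // fall through to the next (lower) priority level
		}
		return nil
	}
	return errors.New("all pipelines failed")
}

func main() {
	calls := 0
	flaky := func(context.Context) error {
		calls++
		if calls == 1 {
			return errors.New("unhealthy on the first attempt")
		}
		return nil // healthy again by the time the retry pass runs
	}
	alwaysFailing := func(context.Context) error { return errors.New("unhealthy") }

	// Level 1 fails, level 2 (the last) fails too, and the immediate retry
	// pass finds level 1 healthy again, so the data is not dropped.
	err := consumeWithLastLevelRetry(context.Background(), []func(context.Context) error{flaky, alwaysFailing})
	fmt.Println(err) // <nil>
}
```

The connector itself routes this through sampleRetryConsumers and its pipeline selector, but the guard and the one-shot retry pass follow this shape.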
42 changes: 42 additions & 0 deletions connector/failoverconnector/logs_test.go
@@ -170,6 +170,48 @@ func TestLogsWithQueue(t *testing.T) {
assert.NoError(t, conn.ConsumeLogs(t.Context(), ld))
}

func TestLogsImmediateRetryOnLastPipelineFailure(t *testing.T) {
var sinkFirst, sinkSecond consumertest.LogsSink
logsFirst := pipeline.NewIDWithName(pipeline.SignalLogs, "logs/first")
logsSecond := pipeline.NewIDWithName(pipeline.SignalLogs, "logs/second")

cfg := &Config{
PipelinePriority: [][]pipeline.ID{{logsFirst}, {logsSecond}},
RetryInterval: 5 * time.Minute, // Long interval so background retry doesn't interfere
}

router := connector.NewLogsRouter(map[pipeline.ID]consumer.Logs{
logsFirst: &sinkFirst,
logsSecond: &sinkSecond,
})

conn, err := NewFactory().CreateLogsToLogs(t.Context(),
connectortest.NewNopSettings(metadata.Type), cfg, router.(consumer.Logs))
require.NoError(t, err)

failoverConnector := conn.(*logsFailover)
defer func() {
assert.NoError(t, failoverConnector.Shutdown(t.Context()))
}()

ld := sampleLog()

// make the 1st pipeline fail
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errLogsConsumer))

err = failoverConnector.ConsumeLogs(t.Context(), ld)
assert.NoError(t, err)
assert.Equal(t, 1, failoverConnector.failover.pS.CurrentPipeline())

// make the 2nd pipeline fail and recover the 1st one
failoverConnector.failover.ModifyConsumerAtIndex(1, consumertest.NewErr(errLogsConsumer))
failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkFirst)

err = failoverConnector.ConsumeLogs(t.Context(), ld)
assert.NoError(t, err)
assert.Equal(t, 0, failoverConnector.failover.pS.CurrentPipeline())
}

func consumeLogsAndCheckStable(conn *logsFailover, idx int, lr plog.Logs) bool {
_ = conn.ConsumeLogs(context.Background(), lr)
stableIndex := conn.failover.pS.CurrentPipeline()
6 changes: 6 additions & 0 deletions connector/failoverconnector/metrics.go
@@ -47,6 +47,12 @@ func (f *metricsRouter) consumeByHealthyPipeline(ctx context.Context, md pmetric
}

if err := tc.ConsumeMetrics(ctx, md); err != nil {
if idx > 0 && idx == len(f.cfg.PipelinePriority)-1 {
if f.sampleRetryConsumers(ctx, md) {
return nil
}
}

f.reportConsumerError(idx)
continue
}
42 changes: 42 additions & 0 deletions connector/failoverconnector/metrics_test.go
@@ -168,6 +168,48 @@ func TestMetricsWithQueue(t *testing.T) {
assert.NoError(t, conn.ConsumeMetrics(t.Context(), md))
}

func TestMetricsImmediateRetryOnLastPipelineFailure(t *testing.T) {
var sinkFirst, sinkSecond consumertest.MetricsSink
metricsFirst := pipeline.NewIDWithName(pipeline.SignalMetrics, "metrics/first")
metricsSecond := pipeline.NewIDWithName(pipeline.SignalMetrics, "metrics/second")

cfg := &Config{
PipelinePriority: [][]pipeline.ID{{metricsFirst}, {metricsSecond}},
RetryInterval: 5 * time.Minute,
}

router := connector.NewMetricsRouter(map[pipeline.ID]consumer.Metrics{
metricsFirst: &sinkFirst,
metricsSecond: &sinkSecond,
})

conn, err := NewFactory().CreateMetricsToMetrics(t.Context(),
connectortest.NewNopSettings(metadata.Type), cfg, router.(consumer.Metrics))
require.NoError(t, err)

failoverConnector := conn.(*metricsFailover)
defer func() {
assert.NoError(t, failoverConnector.Shutdown(t.Context()))
}()

md := sampleMetric()

// make the 1st pipeline fail
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errMetricsConsumer))

err = failoverConnector.ConsumeMetrics(t.Context(), md)
assert.NoError(t, err)
assert.Equal(t, 1, failoverConnector.failover.pS.CurrentPipeline())

// make the 2nd pipeline fail and recover the 1st one
failoverConnector.failover.ModifyConsumerAtIndex(1, consumertest.NewErr(errMetricsConsumer))
failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkFirst)

err = failoverConnector.ConsumeMetrics(t.Context(), md)
assert.NoError(t, err)
assert.Equal(t, 0, failoverConnector.failover.pS.CurrentPipeline())
}

func consumeMetricsAndCheckStable(conn *metricsFailover, idx int, mr pmetric.Metrics) bool {
_ = conn.ConsumeMetrics(context.Background(), mr)
stableIndex := conn.failover.pS.CurrentPipeline()
6 changes: 6 additions & 0 deletions connector/failoverconnector/traces.go
@@ -48,6 +48,12 @@ func (f *tracesRouter) consumeByHealthyPipeline(ctx context.Context, td ptrace.T
}

if err := tc.ConsumeTraces(ctx, td); err != nil {
if idx > 0 && idx == len(f.cfg.PipelinePriority)-1 {
if f.sampleRetryConsumers(ctx, td) {
return nil
}
}

f.reportConsumerError(idx)
continue
}
42 changes: 42 additions & 0 deletions connector/failoverconnector/traces_test.go
@@ -173,6 +173,48 @@ func TestTracesWithQueue(t *testing.T) {
assert.NoError(t, wrappedConn.ConsumeTraces(t.Context(), tr))
}

func TestTracesImmediateRetryOnLastPipelineFailure(t *testing.T) {
var sinkFirst, sinkSecond consumertest.TracesSink
tracesFirst := pipeline.NewIDWithName(pipeline.SignalTraces, "traces/first")
tracesSecond := pipeline.NewIDWithName(pipeline.SignalTraces, "traces/second")

cfg := &Config{
PipelinePriority: [][]pipeline.ID{{tracesFirst}, {tracesSecond}},
RetryInterval: 5 * time.Minute,
}

router := connector.NewTracesRouter(map[pipeline.ID]consumer.Traces{
tracesFirst: &sinkFirst,
tracesSecond: &sinkSecond,
})

conn, err := NewFactory().CreateTracesToTraces(t.Context(),
connectortest.NewNopSettings(metadata.Type), cfg, router.(consumer.Traces))
require.NoError(t, err)

failoverConnector := conn.(*tracesFailover)
defer func() {
assert.NoError(t, failoverConnector.Shutdown(t.Context()))
}()

tr := sampleTrace()

// force the 1st pipeline to fail
failoverConnector.failover.ModifyConsumerAtIndex(0, consumertest.NewErr(errTracesConsumer))

err = failoverConnector.ConsumeTraces(t.Context(), tr)
assert.NoError(t, err)
assert.Equal(t, 1, failoverConnector.failover.pS.CurrentPipeline())

// make the 2nd pipeline fail and recover the 1st one
failoverConnector.failover.ModifyConsumerAtIndex(1, consumertest.NewErr(errTracesConsumer))
failoverConnector.failover.ModifyConsumerAtIndex(0, &sinkFirst)

err = failoverConnector.ConsumeTraces(t.Context(), tr)
assert.NoError(t, err)
assert.Equal(t, 0, failoverConnector.failover.pS.CurrentPipeline())
}

func consumeTracesAndCheckStable(router *tracesRouter, idx int, tr ptrace.Traces) bool {
_ = router.Consume(context.Background(), tr)
stableIndex := router.pS.CurrentPipeline()