rls: only reset backoff on recovery from TRANSIENT_FAILURE #8720
balancer/rls/control_channel.go:

```diff
@@ -187,6 +187,11 @@ func (cc *controlChannel) monitorConnectivityState() {
 	cc.connectivityStateCh.Load()
 	cc.logger.Infof("Connectivity state is READY")
 
+	// Track whether we've seen TRANSIENT_FAILURE since the last READY state.
+	// We only want to reset backoff when recovering from an actual failure,
+	// not when transitioning through benign states like IDLE.
+	seenTransientFailure := false
+
 	for {
 		s, ok := <-cc.connectivityStateCh.Get()
 		if !ok {
@@ -197,9 +202,21 @@ func (cc *controlChannel) monitorConnectivityState() {
 		if s == connectivity.Shutdown {
 			return
 		}
+
+		// Track if we've entered TRANSIENT_FAILURE state.
+		if s == connectivity.TransientFailure {
+			seenTransientFailure = true
+		}
+
+		// Only reset backoff if we're returning to READY after a failure.
 		if s == connectivity.Ready {
-			cc.logger.Infof("Control channel back to READY")
-			cc.backToReadyFunc()
+			if seenTransientFailure {
+				cc.logger.Infof("Control channel back to READY after TRANSIENT_FAILURE")
+				cc.backToReadyFunc()
+				seenTransientFailure = false
+			} else {
+				cc.logger.Infof("Control channel back to READY (no prior failure)")
+			}
 		}
 
 		cc.logger.Infof("Connectivity state is %s", s)
```
balancer/rls/control_channel_test.go:

```diff
@@ -26,13 +26,15 @@ import (
 	"fmt"
 	"os"
 	"regexp"
+	"sync"
 	"testing"
 	"time"
 
 	"github.com/google/go-cmp/cmp"
 	"google.golang.org/grpc"
+	"google.golang.org/grpc/balancer"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/connectivity"
 	"google.golang.org/grpc/credentials"
 	"google.golang.org/grpc/internal"
 	rlspb "google.golang.org/grpc/internal/proto/grpc_lookup_v1"
@@ -463,3 +465,94 @@ func (s) TestNewControlChannelUnsupportedCredsBundle(t *testing.T) {
 		t.Fatal("newControlChannel succeeded when expected to fail")
 	}
 }
+
+// TestControlChannelConnectivityStateTransitions verifies that the control
+// channel only resets backoff when recovering from TRANSIENT_FAILURE, not
+// when going through benign state changes like READY → IDLE → READY.
+func (s) TestControlChannelConnectivityStateTransitions(t *testing.T) {
+	tests := []struct {
+		name              string
+		states            []connectivity.State
+		wantCallbackCount int
+	}{
+		{
+			name: "READY → TRANSIENT_FAILURE → READY triggers callback",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 1,
+		},
+		{
+			name: "READY → IDLE → READY does not trigger callback",
+			states: []connectivity.State{
+				connectivity.Idle,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 0,
+		},
+		{
+			name: "Multiple failures trigger callback each time",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Ready,
+				connectivity.TransientFailure,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 2,
+		},
+		{
+			name: "IDLE between failures doesn't affect callback",
+			states: []connectivity.State{
+				connectivity.TransientFailure,
+				connectivity.Idle,
+				connectivity.Ready,
+			},
+			wantCallbackCount: 1,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			// Start an RLS server
+			rlsServer, _ := rlstest.SetupFakeRLSServer(t, nil)
+
+			// Setup callback to count invocations
+			callbackCount := 0
+			var mu sync.Mutex
+			callback := func() {
+				mu.Lock()
+				callbackCount++
+				mu.Unlock()
+			}
+
+			// Create control channel
+			ctrlCh, err := newControlChannel(rlsServer.Address, "", defaultTestTimeout, balancer.BuildOptions{}, callback)
+			if err != nil {
+				t.Fatalf("Failed to create control channel: %v", err)
+			}
+			defer ctrlCh.close()
+
+			// Give the channel time to reach initial READY state
+			time.Sleep(100 * time.Millisecond)
+
+			// Inject the test state sequence
+			for _, state := range tt.states {
+				ctrlCh.OnMessage(state)
+				// Give time for the monitoring goroutine to process the state
+				time.Sleep(50 * time.Millisecond)
+			}
+
+			// Give extra time for any pending callbacks
+			time.Sleep(100 * time.Millisecond)
+
+			mu.Lock()
+			gotCallbackCount := callbackCount
+			mu.Unlock()
+
+			if gotCallbackCount != tt.wantCallbackCount {
+				t.Errorf("Got %d callback invocations, want %d", gotCallbackCount, tt.wantCallbackCount)
+			}
+		})
+	}
+}
```

Member (inline review comment on the new test): I think there might be some way to improve the test. Maybe we can use waitGroups, but I will defer to @easwars for his opinion on this.
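Acting on that suggestion would replace the fixed sleeps with explicit synchronization. Below is a minimal, self-contained sketch of one way to do it: the callback signals a buffered channel, which serves the same purpose as a WaitGroup here while also letting each wait carry a timeout. `waitForCallbacks`, `quietPeriod`, and the timeout constant are invented for illustration and are not part of the rls package:

```go
package main

import (
	"fmt"
	"time"
)

const (
	defaultTestTimeout = 5 * time.Second        // stand-in for the test suite's constant
	quietPeriod        = 100 * time.Millisecond // grace period to catch extra callbacks
)

// waitForCallbacks blocks until `want` callback signals arrive, then verifies
// that no unexpected extra invocation shows up during a short quiet period.
func waitForCallbacks(callbackCh <-chan struct{}, want int) error {
	for i := 0; i < want; i++ {
		select {
		case <-callbackCh:
		case <-time.After(defaultTestTimeout):
			return fmt.Errorf("timed out waiting for callback %d of %d", i+1, want)
		}
	}
	select {
	case <-callbackCh:
		return fmt.Errorf("got more than %d callback invocations", want)
	case <-time.After(quietPeriod):
		return nil
	}
}

func main() {
	callbackCh := make(chan struct{}, 10)
	callback := func() { callbackCh <- struct{}{} } // would be passed to newControlChannel
	callback()
	fmt.Println(waitForCallbacks(callbackCh, 1)) // <nil>
}
```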
Review comment:

FYI: This is the exact text that describes the expected behavior:

> The policy will monitor the state of the control plane channel. When the state transitions to TRANSIENT_FAILURE, it will record that transition, and the next time it transitions to READY, the policy will iterate through the cache to reset the backoff timeouts in all cache entries. Specifically, this means that it will reset the backoff state and cancel the pending backoff timer. Note that when cancelling the backoff timer, just like when the backoff timer fires normally, a new picker is returned to the channel, to force it to re-process any wait-for-ready RPCs that may still be queued if we failed them while we were in backoff. However, we should optimize this case by returning only one new picker, regardless of how many backoff timers are cancelled.
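To make the quoted behavior concrete, here is a minimal runnable sketch of that cache sweep. `rlsBalancer`, `cacheEntry`, and `sendNewPicker` are hypothetical stand-ins (the real types in balancer/rls differ); the point is the single picker update at the end, regardless of how many timers were cancelled:

```go
package main

import (
	"fmt"
	"time"
)

// Hypothetical stand-ins for the rls balancer's cache types.
type backoffState struct{}

type cacheEntry struct {
	backoffState *backoffState
	backoffTimer *time.Timer
}

type rlsBalancer struct {
	cache []*cacheEntry
}

func (lb *rlsBalancer) sendNewPicker() { fmt.Println("new picker sent") }

// resetBackoffAll mirrors the quoted behavior: on the first READY after a
// TRANSIENT_FAILURE, reset the backoff state and cancel the pending backoff
// timer in every cache entry, then push exactly one new picker to the channel.
func (lb *rlsBalancer) resetBackoffAll() {
	updatePicker := false
	for _, entry := range lb.cache {
		if entry.backoffTimer != nil && entry.backoffTimer.Stop() {
			// A pending timer was cancelled, so queued wait-for-ready RPCs
			// must be re-processed via a fresh picker.
			updatePicker = true
		}
		entry.backoffTimer = nil
		entry.backoffState = nil
	}
	if updatePicker {
		lb.sendNewPicker() // one picker for the whole sweep
	}
}

func main() {
	lb := &rlsBalancer{cache: []*cacheEntry{
		{backoffState: &backoffState{}, backoffTimer: time.NewTimer(time.Hour)},
		{backoffState: &backoffState{}, backoffTimer: time.NewTimer(time.Hour)},
	}}
	lb.resetBackoffAll() // prints "new picker sent" once
}
```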
Follow-up review comment:

Based on the above text, we don't even have to wait for the first time the control channel goes READY. This means that we can simplify the code quite a bit and not even have a control channel connectivity state monitoring goroutine. All we need is the following:

- grpc-go/balancer/rls/control_channel.go, line 91 in cdbafd3
- In the method implementing the `grpcsync.Subscriber` interface, we currently push the received connectivity state update on to an unbounded buffer here: grpc-go/balancer/rls/control_channel.go, line 104 in cdbafd3
- ...which is then consumed by the `for` loop in the monitoring goroutine here: grpc-go/balancer/rls/control_channel.go, line 177 in cdbafd3

The above if-elseif-else can also be implemented as a `switch`, and the linter might complain if that is not the case.
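A rough sketch of that simplification, assuming the transition logic moves straight into `OnMessage` and reusing the PR's `seenTransientFailure` and `backToReadyFunc` names. The struct is pared down to the relevant fields, and the absence of a mutex leans on grpcsync.PubSub delivering subscriber callbacks serially (an assumption worth verifying against the actual code):

```go
package rls

import "google.golang.org/grpc/connectivity"

// controlChannel, reduced to the fields this sketch needs; the real struct
// in balancer/rls/control_channel.go has more.
type controlChannel struct {
	seenTransientFailure bool   // has the channel failed since the last READY?
	backToReadyFunc      func() // resets backoff across all cache entries
}

// OnMessage implements the grpcsync.Subscriber interface. With the monitoring
// goroutine and the unbounded buffer gone, the connectivity state update is
// handled inline, written as a switch per the linter note above.
func (cc *controlChannel) OnMessage(msg any) {
	st, ok := msg.(connectivity.State)
	if !ok {
		return
	}
	switch st {
	case connectivity.TransientFailure:
		// Record the failure; the next READY should reset backoff.
		cc.seenTransientFailure = true
	case connectivity.Ready:
		if cc.seenTransientFailure {
			cc.seenTransientFailure = false
			cc.backToReadyFunc()
		}
	default:
		// IDLE, CONNECTING and SHUTDOWN need no special handling here.
	}
}
```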