Skip to content

Commit 5d4dc23

Browse files
committed
NCBC-4201: Fix Stellar HTTP/2 retries and strict timeout enforcement
Motivation ========== Stellar operations lacked proper timeout enforcement and often suffered unbounded retry loops. Additionally, transient HTTP/2 transport errors (like connection resets) surfaced as fatal internal errors instead of being safely retried. Modifications ============ - Enabled multiple HTTP/2 connections in StellarCluster to recover cleanly from dead connections. - Enforced strict timeouts via dynamically shrinking `RemainingTimeout` in GrpcCallOptions, replacing broken retry-limit checks. - Delegated timeout enforcement entirely to `DeadlineExceeded`. - Mapped transient `HttpRequestException` and `IOException` transport errors to `ServiceNotAvailable` to enable graceful retries. Result ====== Transport resets now retry safely with backoff, matching other SDKs. Initial requests and all retries strictly enforce their timeouts. Change-Id: Id86e824769965f3985b677e236d57897e62f2f33 Reviewed-on: https://review.couchbase.org/c/couchbase-net-client/+/244798 Tested-by: Build Bot <build@couchbase.com> Reviewed-by: Jeffry Morris <jeffrymorris@gmail.com>
1 parent f424dc2 commit 5d4dc23

13 files changed

Lines changed: 803 additions & 207 deletions

File tree

src/Couchbase/Stellar/Core/Retry/StellarRequest.cs

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,40 @@ namespace Couchbase.Stellar.Core.Retry;
1313
*/
1414
public class StellarRequest : IRequest
1515
{
16+
private readonly TimeProvider _timeProvider;
17+
18+
public StellarRequest() : this(TimeProvider.System) { }
19+
20+
/// <summary>
21+
/// Creates a StellarRequest with a custom TimeProvider for deterministic testing.
22+
/// </summary>
23+
internal StellarRequest(TimeProvider timeProvider)
24+
{
25+
_timeProvider = timeProvider;
26+
CreatedAt = _timeProvider.GetUtcNow().UtcDateTime;
27+
}
28+
1629
public uint Attempts { get; set; }
1730
public bool Idempotent { get; set; }
1831
public List<RetryReason> RetryReasons { get; set; } = new();
1932
public IRetryStrategy RetryStrategy { get; set; } = new BestEffortRetryStrategy();
2033
public TimeSpan Timeout { get; set; }
2134
public TimeSpan Elapsed { get; }
35+
36+
/// <summary>
37+
/// The time this request was created. Used to compute remaining timeout.
38+
/// </summary>
39+
public DateTime CreatedAt { get; }
40+
41+
/// <summary>
42+
/// Returns the remaining time before this request should time out,
43+
/// or null if no timeout was set (Timeout is zero).
44+
/// Used to set shrinking gRPC deadlines on each retry attempt.
45+
/// </summary>
46+
public TimeSpan? RemainingTimeout =>
47+
Timeout > TimeSpan.Zero
48+
? Timeout - (_timeProvider.GetUtcNow().UtcDateTime - CreatedAt)
49+
: null;
2250
public CancellationToken Token { get; set; }
2351
public string? ClientContextId { get; set; }
2452
public string? Statement { get; set; }
@@ -40,5 +68,4 @@ public void LogOrphaned()
4068
}
4169

4270
public GenericErrorContext Context { get; set; } = new();
43-
public int MaxRetryAttempts { get; set; } = 10;
4471
}

src/Couchbase/Stellar/Core/Retry/StellarRetryHandler.cs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
using System;
2+
using System.Net.Http;
23
using System.Threading.Tasks;
34
using Couchbase.Core;
45
using Couchbase.Core.Exceptions;
@@ -19,9 +20,21 @@ namespace Couchbase.Stellar.Core.Retry;
1920

2021
internal class StellarRetryHandler : IRetryOrchestrator
2122
{
23+
private readonly TimeProvider _timeProvider;
24+
25+
public StellarRetryHandler() : this(TimeProvider.System) { }
26+
27+
/// <summary>
28+
/// Creates a StellarRetryHandler with a custom TimeProvider for deterministic testing.
29+
/// </summary>
30+
internal StellarRetryHandler(TimeProvider timeProvider)
31+
{
32+
_timeProvider = timeProvider;
33+
}
34+
2235
public async Task<T> RetryAsync<T>(Func<Task<T>> send, IRequest request) where T : IServiceResult
2336
{
24-
var backoff = ControlledBackoff.Create();
37+
var backoff = ControlledBackoff.Create(_timeProvider);
2538
var context = new GenericErrorContext();
2639

2740
while (true)
@@ -48,6 +61,15 @@ public async Task<T> RetryAsync<T>(Func<Task<T>> send, IRequest request) where T
4861
await backoff.Delay(request).ConfigureAwait(false);
4962
}
5063
}
64+
catch (Exception e) when (IsTransientTransportException(e))
65+
{
66+
// HTTP/2 transport failures (e.g. connection reset, broken pipe) can
67+
// surface as HttpRequestException or IOException rather than RpcException.
68+
// Treat these like Unavailable and retry.
69+
context.RetryReasons.Add(RetryReason.ServiceNotAvailable);
70+
request.Attempts++;
71+
await backoff.Delay(request).ConfigureAwait(false);
72+
}
5173
}
5274
}
5375

@@ -205,6 +227,15 @@ private void HandleException(RpcException protoException, IRequest request, Gene
205227
if (request.Idempotent) throw new UnambiguousTimeoutException(detail);
206228
throw new AmbiguousTimeoutException();
207229
case StatusCode.Internal:
230+
if (IsTransientGrpcTransportError(protoException))
231+
{
232+
// HTTP/2 connection failures surface as StatusCode.Internal in .NET's
233+
// gRPC library. These are transient transport issues, not server errors.
234+
// Retry them like Unavailable, matching Java SDK behavior.
235+
context.RetryReasons.Add(RetryReason.ServiceNotAvailable);
236+
request.Attempts++;
237+
break;
238+
}
208239
throw new InternalServerFailureException(detail);
209240
case StatusCode.Unavailable:
210241
context.RetryReasons.Add(RetryReason.ServiceNotAvailable);
@@ -253,4 +284,32 @@ public Task<ResponseStatus> RetryAsync(BucketBase bucket, IOperation operation,
253284
{
254285
throw new NotImplementedException();
255286
}
287+
288+
/// <summary>
289+
/// Determines whether a gRPC Internal status error is actually a transient transport
290+
/// failure (e.g. HTTP/2 connection reset) rather than a genuine server-side error.
291+
/// In production, Grpc.Net.Client wraps transport exceptions (HttpRequestException,
292+
/// IOException) as the InnerException of the RpcException.
293+
/// </summary>
294+
private static bool IsTransientGrpcTransportError(RpcException ex)
295+
{
296+
return ex.InnerException is not null && IsTransientTransportException(ex.InnerException);
297+
}
298+
299+
/// <summary>
300+
/// Determines whether an exception (or any exception in its InnerException chain)
301+
/// represents a transient transport failure that should be retried.
302+
/// </summary>
303+
private static bool IsTransientTransportException(Exception? e)
304+
{
305+
while (e is not null)
306+
{
307+
if (e is HttpRequestException or System.IO.IOException)
308+
{
309+
return true;
310+
}
311+
e = e.InnerException;
312+
}
313+
return false;
314+
}
256315
}

0 commit comments

Comments
 (0)