diff --git a/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs
new file mode 100644
index 0000000000..24e5c801cf
--- /dev/null
+++ b/Microsoft.Azure.Cosmos/src/MetadataRetryHelper.cs
@@ -0,0 +1,246 @@
+//------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------
+
+namespace Microsoft.Azure.Cosmos
+{
+    using System;
+    using System.Runtime.ExceptionServices;
+    using System.Threading;
+    using System.Threading.Tasks;
+    using Microsoft.Azure.Cosmos.Core.Trace;
+    using Microsoft.Azure.Documents;
+
+    /// <summary>
+    /// Retry helper for internal metadata (control-plane) reads such as
+    /// ClientCollectionCache.ReadCollectionAsync.
+    ///
+    /// The shared BackoffRetryUtility evaluates the caller's CancellationToken
+    /// before consulting ShouldRetryAsync. If the caller's timeout trips during
+    /// the control-plane HTTP timeout escalation (0.5s → 5s → 30s ≈ 36s) for an
+    /// unhealthy region, the cross-region failover that ClientRetryPolicy would
+    /// otherwise execute is preempted and the operation surfaces an
+    /// OperationCanceledException.
+    ///
+    /// This helper runs a bounded retry loop that always consults the retry policy
+    /// on exception and, when the caller's token has already been cancelled, grants
+    /// a small bounded grace window so the cross-region failover attempt can run.
+    /// The grace window is intentionally short — the goal is best-effort availability,
+    /// not unbounded timeout extension. If the retry attempt does not complete within
+    /// the grace window, the original exception is rethrown.
+    ///
+    /// Note on the grace bound: the crossRegionRetryGrace value controls when the
+    /// grace attempt may START (via ThrowIfCancellationRequested and propagation into
+    /// the operation lambda). If the underlying operation does not observe the grace
+    /// token (e.g. the store model ignores cancellation), the in-flight call may exceed
+    /// the grace window. Callers relying on a strict upper bound should ensure the
+    /// operation honors its CancellationToken.
+    /// </summary>
+    internal static class MetadataRetryHelper
+    {
+        /// <summary>
+        /// Maximum additional time granted to a metadata read after the caller's
+        /// cancellation token has tripped so that cross-region retry can execute.
+        /// </summary>
+        internal static readonly TimeSpan DefaultCrossRegionRetryGrace = TimeSpan.FromSeconds(10);
+
+        /// <summary>
+        /// Defensive upper bound on the number of attempts the helper makes within a single
+        /// ExecuteAsync invocation. ClientRetryPolicy and its peers already bound their own
+        /// retry counts, but a misconfigured policy that always returns ShouldRetry=true would
+        /// otherwise spin this loop indefinitely.
+        /// </summary>
+        private const int MaxAttemptsHardCap = 20;
+
+        internal static Task<T> ExecuteAsync<T>(
+            Func<CancellationToken, Task<T>> operation,
+            IDocumentClientRetryPolicy retryPolicy,
+            CancellationToken cancellationToken)
+        {
+            return ExecuteAsync(operation, retryPolicy, DefaultCrossRegionRetryGrace, cancellationToken);
+        }
+
+        internal static async Task<T> ExecuteAsync<T>(
+            Func<CancellationToken, Task<T>> operation,
+            IDocumentClientRetryPolicy retryPolicy,
+            TimeSpan crossRegionRetryGrace,
+            CancellationToken cancellationToken)
+        {
+            if (operation == null)
+            {
+                throw new ArgumentNullException(nameof(operation));
+            }
+
+            if (retryPolicy == null)
+            {
+                throw new ArgumentNullException(nameof(retryPolicy));
+            }
+
+            if (crossRegionRetryGrace < TimeSpan.Zero)
+            {
+                throw new ArgumentOutOfRangeException(
+                    nameof(crossRegionRetryGrace),
+                    "Cross-region retry grace must not be negative.");
+            }
+
+            // Contract: the operation lambda MUST honor the CancellationToken passed to it
+            // and MUST NOT close over any outer CancellationToken. On a grace retry attempt
+            // this will be a fresh, bounded-lifetime token decoupled from the caller's
+            // (already cancelled) token. Closing over the outer token re-introduces the
+            // defect this helper is designed to fix.
+            bool graceAttempted = false;
+            int attemptCount = 0;
+            while (true)
+            {
+                if (++attemptCount > MaxAttemptsHardCap)
+                {
+                    DefaultTrace.TraceError(
+                        "MetadataRetryHelper: exceeded hard attempt cap ({0}). Surfacing last exception.",
+                        MaxAttemptsHardCap);
+                    throw new InvalidOperationException(
+                        $"MetadataRetryHelper exceeded the defensive attempt cap of {MaxAttemptsHardCap}. " +
+                        "This indicates a misconfigured retry policy that returns ShouldRetry=true indefinitely.");
+                }
+
+                ExceptionDispatchInfo capturedException;
+                try
+                {
+                    return await operation(cancellationToken).ConfigureAwait(false);
+                }
+                catch (Exception ex)
+                {
+                    capturedException = ExceptionDispatchInfo.Capture(ex);
+                }
+
+                // Always consult the retry policy BEFORE honoring caller cancellation.
+                // This is the correctness fix: BackoffRetryUtility honors cancellation
+                // first, which silently swallows cross-region failover decisions for
+                // metadata reads when the caller's timeout trips during the HTTP
+                // timeout policy escalation.
+                ShouldRetryResult shouldRetry;
+                try
+                {
+                    shouldRetry = await retryPolicy
+                        .ShouldRetryAsync(capturedException.SourceException, CancellationToken.None)
+                        .ConfigureAwait(false);
+                }
+                catch (Exception policyException)
+                {
+                    DefaultTrace.TraceWarning(
+                        "MetadataRetryHelper: retry policy threw while evaluating exception {0}: {1}",
+                        capturedException.SourceException.GetType().Name,
+                        policyException.Message);
+                    capturedException.Throw();
+                    throw; // unreachable
+                }
+
+                if (shouldRetry == null || !shouldRetry.ShouldRetry)
+                {
+                    // Honor ShouldRetryResult.ExceptionToThrow if the policy has specified a
+                    // wrapper/translated exception (matches BackoffRetryUtility.ThrowIfDoneTrying).
+                    // When the policy hands back the same reference as the original exception,
+                    // rethrow via the captured ExceptionDispatchInfo so the original stack trace
+                    // is preserved (mirrors ShouldRetryResult.ThrowIfDoneTrying).
+                    if (shouldRetry?.ExceptionToThrow != null)
+                    {
+                        if (shouldRetry.ExceptionToThrow == capturedException.SourceException)
+                        {
+                            capturedException.Throw();
+                        }
+
+                        throw shouldRetry.ExceptionToThrow;
+                    }
+
+                    capturedException.Throw();
+                }
+
+                // If the caller has not cancelled, honor any backoff and continue the loop.
+                if (!cancellationToken.IsCancellationRequested)
+                {
+                    if (shouldRetry.BackoffTime > TimeSpan.Zero)
+                    {
+                        try
+                        {
+                            await Task.Delay(shouldRetry.BackoffTime, cancellationToken).ConfigureAwait(false);
+                        }
+                        catch (OperationCanceledException)
+                        {
+                            // Caller cancelled during backoff. Fall through — we will
+                            // attempt one bounded cross-region retry below.
+                        }
+                    }
+
+                    if (!cancellationToken.IsCancellationRequested)
+                    {
+                        continue;
+                    }
+                }
+
+                // Caller token is cancelled. Grant a single bounded grace window for
+                // a cross-region retry so availability-critical failover is not
+                // silently preempted. The grace token is passed to the operation so
+                // the underlying HTTP call is not itself preempted by the caller's
+                // already-cancelled token. Subsequent cancellations collapse to the
+                // original exception so we do not extend the caller's timeout unbounded.
+                if (graceAttempted || crossRegionRetryGrace <= TimeSpan.Zero)
+                {
+                    DefaultTrace.TraceInformation(
+                        "MetadataRetryHelper: caller token cancelled; cross-region retry grace already used or disabled. Surfacing original exception.");
+                    capturedException.Throw();
+                }
+
+                graceAttempted = true;
+                DefaultTrace.TraceVerbose(
+                    "MetadataRetryHelper: caller token cancelled; granting {0}ms grace for one cross-region metadata retry.",
+                    (int)crossRegionRetryGrace.TotalMilliseconds);
+
+                using (CancellationTokenSource graceCts = new CancellationTokenSource(crossRegionRetryGrace))
+                {
+                    try
+                    {
+                        return await ExecuteSingleAttemptWithGraceAsync(
+                            operation,
+                            shouldRetry.BackoffTime,
+                            graceCts.Token).ConfigureAwait(false);
+                    }
+                    catch (OperationCanceledException) when (graceCts.IsCancellationRequested)
+                    {
+                        // Grace window expired before the cross-region attempt completed.
+                        // Surface the original failure rather than the grace-timeout.
+                        DefaultTrace.TraceWarning(
+                            "MetadataRetryHelper: grace window ({0}ms) expired during cross-region retry. Surfacing original exception.",
+                            (int)crossRegionRetryGrace.TotalMilliseconds);
+                        capturedException.Throw();
+                        throw; // unreachable
+                    }
+                    catch (Exception graceException)
+                    {
+                        // Cross-region attempt itself failed. Surface the ORIGINAL exception so
+                        // callers see the pre-failover failure mode, not the grace-region failure.
+                        DefaultTrace.TraceWarning(
+                            "MetadataRetryHelper: grace-region retry failed with {0}: {1}. Surfacing original exception.",
+                            graceException.GetType().Name,
+                            graceException.Message);
+                        capturedException.Throw();
+                        throw; // unreachable
+                    }
+                }
+            }
+        }
+
+        private static async Task<T> ExecuteSingleAttemptWithGraceAsync<T>(
+            Func<CancellationToken, Task<T>> operation,
+            TimeSpan backoff,
+            CancellationToken graceToken)
+        {
+            if (backoff > TimeSpan.Zero)
+            {
+                await Task.Delay(backoff, graceToken).ConfigureAwait(false);
+            }
+
+            graceToken.ThrowIfCancellationRequested();
+            return await operation(graceToken).ConfigureAwait(false);
+        }
+    }
+}
diff --git a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
index cc8b5a1bb6..b05167053b 100644
--- a/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
+++ b/Microsoft.Azure.Cosmos/src/Routing/ClientCollectionCache.cs
@@ -53,15 +53,16 @@ protected override Task<ContainerProperties> GetByRidAsync(string apiVersion,
             cancellationToken.ThrowIfCancellationRequested();
             IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy(
                 this.sessionContainer, this.retryPolicy.GetRequestPolicy());
-            return TaskHelper.InlineIfPossible(
-                () => this.ReadCollectionAsync(
-                    PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false),
+            return TaskHelper.RunInlineIfNeededAsync(
+                () => MetadataRetryHelper.ExecuteAsync(
+                    (ct) => this.ReadCollectionAsync(
+                        PathsHelper.GeneratePath(ResourceType.Collection, collectionRid, false),
+                        retryPolicyInstance,
+                        trace,
+                        clientSideRequestStatistics,
+                        ct),
-                    retryPolicyInstance,
-                    trace,
-                    clientSideRequestStatistics,
-                    cancellationToken),
-                retryPolicyInstance,
-                cancellationToken);
+                    retryPolicyInstance,
+                    cancellationToken));
         }
 
         protected override Task<ContainerProperties> GetByNameAsync(string apiVersion,
@@ -73,11 +74,12 @@ protected override Task<ContainerProperties> GetByNameAsync(string apiVersion,
             cancellationToken.ThrowIfCancellationRequested();
             IDocumentClientRetryPolicy retryPolicyInstance = new ClearingSessionContainerClientRetryPolicy(
                 this.sessionContainer, this.retryPolicy.GetRequestPolicy());
-            return TaskHelper.InlineIfPossible(
-                () => this.ReadCollectionAsync(
-                    resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, cancellationToken),
-                retryPolicyInstance,
-                cancellationToken);
+            return TaskHelper.RunInlineIfNeededAsync(
+                () => MetadataRetryHelper.ExecuteAsync(
+                    (ct) => this.ReadCollectionAsync(
+                        resourceAddress, retryPolicyInstance, trace, clientSideRequestStatistics, ct),
+                    retryPolicyInstance,
+                    cancellationToken));
         }
 
         internal override Task<ContainerProperties> ResolveByNameAsync(
diff --git a/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs
new file mode 100644
index 0000000000..a513a22af6
--- /dev/null
+++ b/Microsoft.Azure.Cosmos/tests/Microsoft.Azure.Cosmos.Tests/MetadataRetryHelperTests.cs
@@ -0,0 +1,608 @@
+//------------------------------------------------------------
+// Copyright (c) Microsoft Corporation. All rights reserved.
+//------------------------------------------------------------
+
+namespace Microsoft.Azure.Cosmos.Tests
+{
+    using System;
+    using System.Net;
+    using System.Threading;
+    using System.Threading.Tasks;
+    using Microsoft.Azure.Documents;
+    using Microsoft.VisualStudio.TestTools.UnitTesting;
+    using Moq;
+
+    /// <summary>
+    /// Unit tests for MetadataRetryHelper.
+    ///
+    /// These tests reproduce the control-plane metadata retry defect described in
+    /// PR #5787 — when a caller's cancellation token trips during the control-plane
+    /// HTTP timeout escalation against an unhealthy region, the cross-region retry
+    /// that would otherwise execute is preempted.
+    ///
+    /// Each test is written so that the pre-fix implementation (TaskHelper.InlineIfPossible
+    /// → BackoffRetryUtility) would FAIL the assertion, while the fixed implementation
+    /// (MetadataRetryHelper) passes.
+    /// </summary>
+    [TestClass]
+    public class MetadataRetryHelperTests
+    {
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_SucceedsFirstAttempt_NoRetry()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            int attempts = 0;
+
+            int result = await MetadataRetryHelper.ExecuteAsync(
+                (_) =>
+                {
+                    attempts++;
+                    return Task.FromResult(42);
+                },
+                policy.Object,
+                CancellationToken.None);
+
+            Assert.AreEqual(42, result);
+            Assert.AreEqual(1, attempts);
+            policy.Verify(
+                p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()),
+                Times.Never);
+        }
+
+        /// <summary>
+        /// Exercises the classic retry loop: exception, policy says retry, second attempt
+        /// succeeds. No cancellation involved.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_RetriesOnTransient_WhenPolicyAllows()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            int attempts = 0;
+
+            int result = await MetadataRetryHelper.ExecuteAsync(
+                (_) =>
+                {
+                    attempts++;
+                    if (attempts == 1)
+                    {
+                        throw new DocumentClientException(
+                            "transient",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    }
+
+                    return Task.FromResult(7);
+                },
+                policy.Object,
+                CancellationToken.None);
+
+            Assert.AreEqual(7, result);
+            Assert.AreEqual(2, attempts);
+        }
+
+        /// <summary>
+        /// REPRO / FIX VERIFICATION: the caller's cancellation token is already cancelled
+        /// at the time the first attempt throws a 503. The retry policy says "retry"
+        /// (cross-region failover). The fix must still execute one cross-region attempt
+        /// against the detached grace token.
+        ///
+        /// Pre-fix behavior (BackoffRetryUtility): throws OperationCanceledException
+        /// without consulting the policy, or consults the policy but then immediately
+        /// honors the cancelled token before the retry attempt. Either way, attempts == 1.
+        ///
+        /// Post-fix behavior (MetadataRetryHelper): attempts == 2 and the second attempt
+        /// returns successfully.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+            CancellationToken secondAttemptToken = default;
+
+            int result = await MetadataRetryHelper.ExecuteAsync(
+                (ct) =>
+                {
+                    attempts++;
+                    if (attempts == 1)
+                    {
+                        throw new DocumentClientException(
+                            "503 from unhealthy region",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    }
+
+                    // Capture the token observed by the grace attempt so the detached-token
+                    // contract can be asserted below.
+                    secondAttemptToken = ct;
+                    return Task.FromResult(100);
+                },
+                policy.Object,
+                MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+                cts.Token);
+
+            Assert.AreEqual(100, result, "Cross-region retry should have executed despite cancelled caller token.");
+            Assert.AreEqual(2, attempts, "Exactly one cross-region retry attempt should run on the grace token.");
+
+            // Pin the detached-grace-token contract: the grace attempt MUST NOT receive the
+            // caller's already-cancelled token. A future refactor that accidentally passes
+            // cancellationToken (instead of graceCts.Token) would silently reintroduce the
+            // defect this helper was built to fix — this assertion catches that regression.
+            Assert.IsFalse(
+                secondAttemptToken.IsCancellationRequested,
+                "Grace attempt must receive a fresh, non-cancelled token decoupled from the caller's cancelled token.");
+            Assert.AreNotEqual(
+                cts.Token,
+                secondAttemptToken,
+                "Grace attempt token must not be the caller's cancellation token.");
+        }
+
+        /// <summary>
+        /// Verifies the bounded grace: if the caller's token is cancelled AND the cross-region
+        /// retry attempt itself also fails with a retriable error, the helper does not loop
+        /// indefinitely. It grants at most one grace window and then surfaces the original
+        /// exception (not the grace-timeout exception).
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_GraceIsBounded_SurfacesOriginalExceptionOnSecondFailure()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            $"503 attempt {attempts}",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+                    cts.Token));
+
+            Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run; then original is surfaced.");
+            StringAssert.Contains(thrown.Message, "503 attempt 1");
+        }
+
+        /// <summary>
+        /// Caller token cancelled and the retry policy says "no retry" (e.g. 404, throttling exhausted).
+        /// The helper must surface the original exception, not OperationCanceledException.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_PolicyDeniesRetry_SurfacesOriginalExceptionEvenWhenCancelled()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.NoRetry());
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            "404",
+                            HttpStatusCode.NotFound,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+                    cts.Token));
+
+            Assert.AreEqual(1, attempts);
+            StringAssert.Contains(thrown.Message, "404");
+        }
+
+        /// <summary>
+        /// If the grace window is zero and the caller token is cancelled, the helper must
+        /// not attempt a cross-region retry. Original exception is surfaced on the first failure.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_ZeroGrace_DoesNotRetryAfterCancellation()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            "503",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    TimeSpan.Zero,
+                    cts.Token));
+
+            Assert.AreEqual(1, attempts);
+            StringAssert.Contains(thrown.Message, "503");
+        }
+
+        /// <summary>
+        /// If the grace window itself expires while the cross-region attempt is still in flight,
+        /// the helper surfaces the ORIGINAL underlying exception (not the grace-timeout OCE).
+        /// This prevents leaking internal cancellation semantics to callers.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_GraceExpires_SurfacesOriginalException()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+                () => MetadataRetryHelper.ExecuteAsync(
+                    async (ct) =>
+                    {
+                        attempts++;
+                        if (attempts == 1)
+                        {
+                            throw new DocumentClientException(
+                                "original 503",
+                                HttpStatusCode.ServiceUnavailable,
+                                SubStatusCodes.Unknown);
+                        }
+
+                        // Second attempt: observes the grace token and completes deterministically
+                        // only when the grace CTS fires. Using TaskCompletionSource + ct.Register
+                        // avoids the long Task.Delay worst-case on slow CI runners.
+                        TaskCompletionSource<int> tcs = new TaskCompletionSource<int>(
+                            TaskCreationOptions.RunContinuationsAsynchronously);
+                        using (ct.Register(() => tcs.TrySetCanceled(ct)))
+                        {
+                            return await tcs.Task.ConfigureAwait(false);
+                        }
+                    },
+                    policy.Object,
+                    TimeSpan.FromMilliseconds(100),
+                    cts.Token));
+
+            Assert.AreEqual(2, attempts);
+            StringAssert.Contains(thrown.Message, "original 503");
+        }
+
+        /// <summary>
+        /// Exercises the mid-backoff cancellation → grace transition. The caller's token
+        /// is live at the start of the first attempt (so the outer IsCancellationRequested
+        /// check on the backoff path is false), then trips during Task.Delay. The
+        /// OperationCanceledException from the delay is caught and the inner
+        /// IsCancellationRequested check routes execution into the bounded grace
+        /// window — which then succeeds.
+        ///
+        /// This is the most realistic production trigger for the grace path (caller's
+        /// ~36s timeout fires while the helper waits out a retry backoff after a 503 from
+        /// an unhealthy region). All other cancellation tests use a pre-cancelled token,
+        /// which short-circuits the outer check and never exercises the catch block or
+        /// the second cancellation check. A refactor that drops either guard would
+        /// regress one of these branches without any other test catching it.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_CancellationDuringBackoff_TransitionsToGraceRetry()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.FromSeconds(5)));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            int attempts = 0;
+            CancellationToken graceAttemptToken = default;
+
+            int result = await MetadataRetryHelper.ExecuteAsync(
+                (ct) =>
+                {
+                    attempts++;
+                    if (attempts == 1)
+                    {
+                        // Token is live now (outer IsCancellationRequested check on the
+                        // backoff path will be false). Schedule a cancellation that fires
+                        // during Task.Delay(5s, cancellationToken) — this is the production
+                        // case where the caller's timeout trips mid-backoff.
+                        cts.CancelAfter(TimeSpan.FromMilliseconds(50));
+                        throw new DocumentClientException(
+                            "503 from unhealthy region",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    }
+
+                    graceAttemptToken = ct;
+                    return Task.FromResult(99);
+                },
+                policy.Object,
+                MetadataRetryHelper.DefaultCrossRegionRetryGrace,
+                cts.Token);
+
+            Assert.AreEqual(99, result, "Mid-backoff cancellation must transition to grace retry, not surface OCE.");
+            Assert.AreEqual(2, attempts, "Exactly one grace-window retry should run after mid-backoff cancellation.");
+
+            // The grace attempt must receive a fresh token decoupled from the caller's
+            // already-cancelled token. Same contract pinned by
+            // ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled, but
+            // re-asserted here because this path reaches the grace branch via the
+            // OperationCanceledException catch rather than the pre-cancelled outer check.
+            Assert.IsFalse(
+                graceAttemptToken.IsCancellationRequested,
+                "Grace attempt must receive a fresh, non-cancelled token after mid-backoff cancellation.");
+            Assert.AreNotEqual(
+                cts.Token,
+                graceAttemptToken,
+                "Grace attempt token must not be the caller's cancellation token.");
+        }
+
+        /// <summary>
+        /// COMPANION REPRO: exercises the legacy TaskHelper.InlineIfPossible +
+        /// BackoffRetryUtility path that ClientCollectionCache used prior to the
+        /// fix. It demonstrates that, under the same conditions where the fixed
+        /// MetadataRetryHelper succeeds with a cross-region retry
+        /// (ExecuteAsync_CrossRegionRetryExecutes_EvenWhenCallerTokenCancelled),
+        /// the legacy path throws OperationCanceledException without ever
+        /// executing the cross-region attempt that the retry policy requested.
+        ///
+        /// This test is the canonical proof of Defect A and ensures that if anyone
+        /// reverts the fix by calling TaskHelper.InlineIfPossible for a metadata
+        /// read in ClientCollectionCache, the bug surfaces here.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task LegacyBackoffRetryUtility_CrossRegionRetryIsPreempted_ByCancelledCallerToken()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            await Assert.ThrowsExceptionAsync<OperationCanceledException>(
+                () => TaskHelper.InlineIfPossible(
+                    () =>
+                    {
+                        attempts++;
+                        if (attempts == 1)
+                        {
+                            throw new DocumentClientException(
+                                "503 from unhealthy region",
+                                HttpStatusCode.ServiceUnavailable,
+                                SubStatusCodes.Unknown);
+                        }
+
+                        return Task.FromResult(100);
+                    },
+                    policy.Object,
+                    cts.Token));
+
+            Assert.IsTrue(
+                attempts <= 1,
+                $"Legacy BackoffRetryUtility path preempts cross-region retry once caller token is cancelled — " +
+                $"expected at most one attempt (possibly zero if ThrowIfCancellationRequested fires first), got {attempts}. " +
+                $"If this starts asserting on attempts==2, someone improved the shared retry utility to consult the policy " +
+                $"before honoring cancellation — in that case the bespoke MetadataRetryHelper may be removable.");
+        }
+
+        /// <summary>
+        /// Regression guard: when the very first operation attempt throws
+        /// OperationCanceledException (caller token already cancelled and the
+        /// inner HTTP call honors it immediately), the helper must still consult the retry
+        /// policy before surfacing the OCE. If the policy returns NoRetry, the OCE is
+        /// surfaced. This test pins the "policy is consulted FIRST" invariant.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_FirstAttemptOCE_PolicyIsConsultedBeforeSurfacing()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.NoRetry());
+
+            using CancellationTokenSource cts = new CancellationTokenSource();
+            cts.Cancel();
+
+            int attempts = 0;
+
+            await Assert.ThrowsExceptionAsync<OperationCanceledException>(
+                () => MetadataRetryHelper.ExecuteAsync(
+                    (ct) =>
+                    {
+                        attempts++;
+                        ct.ThrowIfCancellationRequested();
+                        return Task.FromResult(42);
+                    },
+                    policy.Object,
+                    cts.Token));
+
+            Assert.AreEqual(1, attempts);
+            policy.Verify(
+                p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()),
+                Times.AtLeastOnce,
+                "Retry policy must be consulted even on first-attempt OCE. If not, the helper's core invariant is broken.");
+        }
+
+        /// <summary>
+        /// Regression guard for finding #5: when ShouldRetryResult.NoRetry is used
+        /// with a policy-specified translated exception, the helper must surface
+        /// THAT exception (matching BackoffRetryUtility.ThrowIfDoneTrying), not the
+        /// original captured exception. If this invariant is broken, retry policies that rewrite
+        /// error types (e.g. NonRetriableInvalidPartitionExceptionRetryPolicy translating
+        /// a gone/invalid-partition into a NotFoundException) would silently diverge in
+        /// behavior when wired through MetadataRetryHelper.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_PolicySpecifiesExceptionToThrow_SurfacesTranslatedException()
+        {
+            InvalidOperationException translated = new InvalidOperationException("policy-translated");
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.NoRetry(translated));
+
+            int attempts = 0;
+
+            InvalidOperationException thrown = await Assert.ThrowsExceptionAsync<InvalidOperationException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            "original 503",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    CancellationToken.None));
+
+            Assert.AreEqual(1, attempts);
+            Assert.AreSame(
+                translated,
+                thrown,
+                "Helper must surface ShouldRetryResult.ExceptionToThrow (policy-translated), not the original exception.");
+        }
+
+        /// <summary>
+        /// Negative grace windows are invalid and must be rejected at the API boundary rather
+        /// than silently collapsed to "disabled". This prevents subtle misuse.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_NegativeGrace_Throws()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+
+            await Assert.ThrowsExceptionAsync<ArgumentOutOfRangeException>(
+                () => MetadataRetryHelper.ExecuteAsync(
+                    (ct) => Task.FromResult(1),
+                    policy.Object,
+                    TimeSpan.FromSeconds(-1),
+                    CancellationToken.None));
+        }
+
+        /// <summary>
+        /// Regression guard for the defensive MaxAttemptsHardCap. A misconfigured retry
+        /// policy that always returns ShouldRetry=true would otherwise
+        /// spin indefinitely. The helper must terminate with an
+        /// InvalidOperationException after the hard cap (20) is exceeded.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_ExceedsMaxAttemptsHardCap_ThrowsInvalidOperationException()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ReturnsAsync(ShouldRetryResult.RetryAfter(TimeSpan.Zero));
+
+            int attempts = 0;
+
+            InvalidOperationException thrown = await Assert.ThrowsExceptionAsync<InvalidOperationException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            "always fail",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    CancellationToken.None));
+
+            Assert.AreEqual(20, attempts, "Helper must stop at the defensive attempt cap of 20.");
+            StringAssert.Contains(thrown.Message, "defensive attempt cap");
+        }
+
+        /// <summary>
+        /// Regression guard: when the retry policy itself throws while evaluating
+        /// ShouldRetryAsync, the helper must surface the ORIGINAL operation
+        /// exception (with its stack trace preserved) rather than the policy's
+        /// internal failure. The policy error is logged but should not be visible
+        /// to callers — diagnosing a metadata read failure should never be
+        /// distorted by a buggy retry policy.
+        /// </summary>
+        [TestMethod]
+        [Owner("ntripician")]
+        public async Task ExecuteAsync_PolicyThrowsDuringShouldRetry_SurfacesOriginalException()
+        {
+            Mock<IDocumentClientRetryPolicy> policy = new Mock<IDocumentClientRetryPolicy>();
+            policy
+                .Setup(p => p.ShouldRetryAsync(It.IsAny<Exception>(), It.IsAny<CancellationToken>()))
+                .ThrowsAsync(new InvalidOperationException("policy internal error"));
+
+            int attempts = 0;
+
+            DocumentClientException thrown = await Assert.ThrowsExceptionAsync<DocumentClientException>(
+                () => MetadataRetryHelper.ExecuteAsync<int>(
+                    (_) =>
+                    {
+                        attempts++;
+                        throw new DocumentClientException(
+                            "original 503",
+                            HttpStatusCode.ServiceUnavailable,
+                            SubStatusCodes.Unknown);
+                    },
+                    policy.Object,
+                    CancellationToken.None));
+
+            Assert.AreEqual(1, attempts);
+            StringAssert.Contains(
+                thrown.Message,
+                "original 503",
+                "Helper must surface the ORIGINAL operation exception, not the policy's internal failure.");
+        }
+    }
+}
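
Reviewer note (illustrative, not part of the diff): the sketch below shows the call-site contract the helper documents, namely that the operation lambda must flow the token the helper hands it rather than closing over the caller's token. ResolveCollectionAsync and readCollectionAsync are hypothetical names; only MetadataRetryHelper.ExecuteAsync and DefaultCrossRegionRetryGrace come from the diff above.

// Hypothetical call-site sketch (not from the PR); the wiring mirrors the
// ClientCollectionCache change above.
private static Task<ContainerProperties> ResolveCollectionAsync(
    Func<CancellationToken, Task<ContainerProperties>> readCollectionAsync,
    IDocumentClientRetryPolicy retryPolicyInstance,
    CancellationToken cancellationToken)
{
    return MetadataRetryHelper.ExecuteAsync(
        // Correct: flow the helper-supplied token so a grace retry runs on a fresh token.
        (ct) => readCollectionAsync(ct),
        // Wrong (would reintroduce the defect): (ct) => readCollectionAsync(cancellationToken)
        retryPolicyInstance,
        MetadataRetryHelper.DefaultCrossRegionRetryGrace,
        cancellationToken);
}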
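A second illustrative sketch, addressing the "Note on the grace bound" caveat in the helper's documentation: if the underlying call ignores its CancellationToken, the operation lambda can still impose a strict upper bound by racing the call against the grace token. This uses only standard BCL APIs (Task.WhenAny, CancellationToken.Register); WithHardBoundAsync and slowCall are hypothetical names, not part of the PR.

// Illustrative helper (not from the PR): bounds a non-cooperative call by the grace token.
internal static async Task<T> WithHardBoundAsync<T>(
    Func<Task<T>> slowCall,
    CancellationToken graceToken)
{
    Task<T> callTask = slowCall();
    TaskCompletionSource<bool> graceFired = new TaskCompletionSource<bool>(
        TaskCreationOptions.RunContinuationsAsynchronously);
    using (graceToken.Register(() => graceFired.TrySetResult(true)))
    {
        // Return control as soon as the grace window expires, even though the inner
        // task may keep running in the background; MetadataRetryHelper then rethrows
        // the original pre-grace exception via its OperationCanceledException filter.
        Task completed = await Task.WhenAny(callTask, graceFired.Task).ConfigureAwait(false);
        if (completed != callTask)
        {
            throw new OperationCanceledException(graceToken);
        }
    }

    return await callTask.ConfigureAwait(false);
}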